In [1]:
%load_ext lab_black

In [2]:
import numpy as np
import pandas as pd

In [3]:
import json
import itertools

from pathlib import Path

In [4]:
from mppsteel.config.model_scenarios import DEFAULT_SCENARIO
from mppsteel.config.model_config import USD_TO_EUR_CONVERSION_DEFAULT

from mppsteel.config.reference_lists import RESOURCE_CATEGORY_MAPPER

from mppsteel.data_preprocessing.variable_plant_cost_archetypes import (
    plant_variable_costs,
    generate_feedstock_dict,
    generate_variable_costs,
    vc_mapper,
)

from mppsteel.utility.file_handling_utility import (
    read_pickle_folder,
    get_scenario_pkl_path,
)

from mppsteel.config.model_config import (
    PKL_DATA_FORMATTED,
    PKL_DATA_IMPORTS,
    MODEL_YEAR_RANGE,
)

In [5]:
# Work on a copy so the imported default scenario is not modified.
scenario_dict = DEFAULT_SCENARIO.copy()
# Store the conversion rate and its reciprocal under the keys read further down.
scenario_dict["usd_to_eur"] = USD_TO_EUR_CONVERSION_DEFAULT
scenario_dict["eur_to_usd"] = 1.0 / scenario_dict["usd_to_eur"]

In [6]:
%%time
# Baseline result from the library implementation; the optimized versions below
# are compared against this frame.
df_reference = plant_variable_costs(scenario_dict)

Variable Cost Loop: 100%|██████████████████████████████████████████████████████| 2356/2356 [02:33<00:00, 15.33it/s]


CPU times: user 2min 30s, sys: 4.54 s, total: 2min 35s
Wall time: 2min 35s


In [7]:
# Scenario-dependent location and conversion rate used by the loaders below.
intermediate_path = get_scenario_pkl_path(
    scenario_dict["scenario_name"], "intermediate"
)
eur_to_usd_rate = scenario_dict["eur_to_usd"]

# Plant table plus the two lookups derived from it.
steel_plants = read_pickle_folder(PKL_DATA_FORMATTED, "steel_plants_processed", "df")
steel_plant_region_ng_dict = (
    steel_plants[["country_code", "cheap_natural_gas"]]
    .set_index("country_code")
    .to_dict()["cheap_natural_gas"]
)
steel_plant_country_codes = list(steel_plants["country_code"].unique())
product_range_year_country = list(
    itertools.product(MODEL_YEAR_RANGE, steel_plant_country_codes)
)

# Scenario-specific reference tables, all read from the intermediate folder.
(
    power_grid_prices_ref,
    h2_prices_ref,
    bio_model_prices_ref,
    ccs_model_transport_ref,
    ccs_model_storage_ref,
) = (
    read_pickle_folder(intermediate_path, table_name, "df")
    for table_name in (
        "power_grid_prices_ref",
        "h2_prices_ref",
        "bio_model_prices_ref",
        "ccs_model_transport_ref",
        "ccs_model_storage_ref",
    )
)

business_cases = read_pickle_folder(
    PKL_DATA_FORMATTED, "standardised_business_cases", "df"
).reset_index()

# Keep only the columns needed for price lookups and index them by (Metric, Year).
static_energy_prices = read_pickle_folder(
    PKL_DATA_IMPORTS, "static_energy_prices", "df"
)[["Metric", "Year", "Value"]].set_index(["Metric", "Year"])

feedstock_dict = generate_feedstock_dict(eur_to_usd_rate)

# Consolidate Input Data

In [125]:
class ModelInput:
    """Container for all inputs needed to compute plant variable costs.

    Every ``get_*`` price method returns a tuple (or list) of DataFrames with the
    columns ``year``, ``country_code``, ``price`` and ``material_category`` so that
    ``get_price_lookup_df`` can star-unpack and concatenate them.

    All methods read their data from the instance only; nothing is taken from
    notebook-level globals except the fallbacks documented on ``from_filesystem``.
    """

    # Year whose static price is reused for model years missing from the static
    # price table. If this year is missing too, the latest available year is used.
    DEFAULT_PRICE_YEAR = 2026

    def __init__(
        self,
        product_range_year_country,
        resource_category_mapper,
        business_cases,
        power_grid_prices_ref,
        h2_prices_ref,
        bio_model_prices_ref,
        year_range,
        ccs_model_storage_ref,
        ccs_model_transport_ref,
        steel_plant_region_ng_dict,
        static_energy_prices,
        feedstock_dict,
        country_codes,
    ):
        """Store the already-loaded input objects without copying or validating them."""
        self.product_range_year_country = product_range_year_country
        self.resource_category_mapper = resource_category_mapper
        self.business_cases = business_cases
        self.power_grid_prices_ref = power_grid_prices_ref
        self.h2_prices_ref = h2_prices_ref
        self.bio_model_prices_ref = bio_model_prices_ref
        self.year_range = year_range
        self.ccs_model_storage_ref = ccs_model_storage_ref
        self.ccs_model_transport_ref = ccs_model_transport_ref
        self.steel_plant_region_ng_dict = steel_plant_region_ng_dict
        self.static_energy_prices = static_energy_prices
        self.feedstock_dict = feedstock_dict
        self.country_codes = country_codes
        # Lazily built cross product of country codes and years (see
        # create_df_from_years_and_contry_codes).
        self._df_years_and_country_codes = None

    @classmethod
    def from_filesystem(
        cls,
        resource_category_mapper=None,
        year_range=None,
        scenario=None,
    ):
        """Load all inputs from the pickle folders and build a ModelInput.

        Args:
            resource_category_mapper: mapping of material category to resource
                group. Defaults to ``RESOURCE_CATEGORY_MAPPER``.
            year_range: iterable of model years. Defaults to ``MODEL_YEAR_RANGE``.
            scenario: mapping providing ``"scenario_name"`` and ``"eur_to_usd"``.
                Defaults to the notebook-level ``scenario_dict``.

        Returns:
            A new ``ModelInput`` whose year/country product is built from the
            ``year_range`` and the country codes loaded here.
        """
        # Defaults are resolved at call time so the class can be defined before
        # the notebook globals exist.
        if resource_category_mapper is None:
            resource_category_mapper = RESOURCE_CATEGORY_MAPPER
        if year_range is None:
            year_range = MODEL_YEAR_RANGE
        if scenario is None:
            scenario = scenario_dict

        intermediate_path = get_scenario_pkl_path(
            scenario["scenario_name"], "intermediate"
        )
        eur_to_usd_rate = scenario["eur_to_usd"]

        steel_plants = read_pickle_folder(
            PKL_DATA_FORMATTED, "steel_plants_processed", "df"
        )
        steel_plant_region_ng_dict = (
            steel_plants[["country_code", "cheap_natural_gas"]]
            .set_index("country_code")
            .to_dict()["cheap_natural_gas"]
        )
        power_grid_prices_ref = read_pickle_folder(
            intermediate_path, "power_grid_prices_ref", "df"
        )
        h2_prices_ref = read_pickle_folder(intermediate_path, "h2_prices_ref", "df")
        bio_model_prices_ref = read_pickle_folder(
            intermediate_path, "bio_model_prices_ref", "df"
        )
        ccs_model_transport_ref = read_pickle_folder(
            intermediate_path, "ccs_model_transport_ref", "df"
        )
        ccs_model_storage_ref = read_pickle_folder(
            intermediate_path, "ccs_model_storage_ref", "df"
        )
        business_cases = read_pickle_folder(
            PKL_DATA_FORMATTED, "standardised_business_cases", "df"
        ).reset_index()
        static_energy_prices = read_pickle_folder(
            PKL_DATA_IMPORTS, "static_energy_prices", "df"
        )[["Metric", "Year", "Value"]].set_index(["Metric", "Year"])
        feedstock_dict = generate_feedstock_dict(eur_to_usd_rate)
        country_codes = list(steel_plants["country_code"].unique())
        # Use the values resolved in this method, not same-named notebook globals.
        product_range_year_country = list(itertools.product(year_range, country_codes))
        return cls(
            product_range_year_country,
            resource_category_mapper,
            business_cases,
            power_grid_prices_ref,
            h2_prices_ref,
            bio_model_prices_ref,
            year_range,
            ccs_model_storage_ref,
            ccs_model_transport_ref,
            steel_plant_region_ng_dict,
            static_energy_prices,
            feedstock_dict,
            country_codes,
        )

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _prices_from_year_country_ref(price_ref, material_category):
        """Turn a ``{(year, country_code): price}`` mapping into a price frame."""
        rows = [(year, cc, price) for (year, cc), price in price_ref.items()]
        df = pd.DataFrame(rows, columns=["year", "country_code", "price"])
        df["material_category"] = material_category
        return df

    def _constant_price_frame(self, material_category):
        """Price frame with one feedstock_dict price for every country and year."""
        df = self.create_df_from_years_and_contry_codes()
        df["price"] = self.feedstock_dict[material_category]
        df["material_category"] = material_category
        return df

    def _static_prices_by_year(self, metric):
        """Return a ``year``/``price`` frame for ``metric`` over ``self.year_range``.

        Years missing from the static table get the price of
        ``DEFAULT_PRICE_YEAR``; if that year is missing as well, the price of the
        latest available year is used instead of raising ``KeyError``.
        """
        prices = self.static_energy_prices.loc[metric].reset_index()
        year_to_price = dict(zip(prices.Year.values, prices.Value.values))
        if self.DEFAULT_PRICE_YEAR in year_to_price:
            default_price = year_to_price[self.DEFAULT_PRICE_YEAR]
        else:
            default_price = year_to_price[max(year_to_price)]
        years = list(self.year_range)
        return pd.DataFrame(
            {
                "year": years,
                "price": [year_to_price.get(year, default_price) for year in years],
            }
        )

    def create_df_from_years_and_contry_codes(self):
        """Return a fresh copy of the country_code x year cross product (cached)."""
        if self._df_years_and_country_codes is None:
            self._df_years_and_country_codes = pd.merge(
                pd.DataFrame(self.country_codes, columns=["country_code"]),
                pd.DataFrame(self.year_range, columns=["year"]),
                how="cross",
            )
        # Callers add columns to the result, so never hand out the cached frame.
        return self._df_years_and_country_codes.copy()

    # ------------------------------------------------------------ price tables

    def get_power_grid_prices(self):
        """One-element tuple with the "Electricity" price frame."""
        return (
            self._prices_from_year_country_ref(
                self.power_grid_prices_ref, "Electricity"
            ),
        )

    def get_hydrogen_prices(self):
        """One-element tuple with the "Hydrogen" price frame."""
        return (self._prices_from_year_country_ref(self.h2_prices_ref, "Hydrogen"),)

    def get_bio_model_prices(self):
        """Return ("Biomass", "Biomethane") frames; both use the same prices."""
        df_mass = self._prices_from_year_country_ref(
            self.bio_model_prices_ref, "Biomass"
        )
        df_methane = df_mass.copy()
        df_methane["material_category"] = "Biomethane"
        return df_mass, df_methane

    def get_store_and_transport_prices(self):
        """Return ("Captured CO2", "Used CO2") frames.

        The price is storage price plus transport price per country, repeated
        for every year in ``self.year_range``.
        """
        df_year = pd.DataFrame(self.year_range, columns=["year"])
        df_storage = pd.DataFrame(
            self.ccs_model_storage_ref.items(),
            columns=["country_code", "price_storage"],
        )
        df_transport = pd.DataFrame(
            self.ccs_model_transport_ref.items(),
            columns=["country_code", "price_transport"],
        )
        df_store_trans = pd.merge(df_storage, df_transport, on="country_code")
        df_store_trans["price"] = (
            df_store_trans.price_storage + df_store_trans.price_transport
        )
        df_store_trans = df_store_trans.drop(
            ["price_storage", "price_transport"], axis=1
        )
        df_store_trans = df_store_trans.merge(df_year, how="cross")
        df_store_trans_captured = df_store_trans.copy()
        df_store_trans_captured["material_category"] = "Captured CO2"
        df_store_trans_used = df_store_trans.copy()
        df_store_trans_used["material_category"] = "Used CO2"
        return df_store_trans_captured, df_store_trans_used

    def get_gas_prices_per_country_and_year(self, gas_type, country_codes):
        """Static prices of ``gas_type`` per year, repeated for ``country_codes``."""
        return pd.merge(
            self._static_prices_by_year(gas_type),
            pd.DataFrame(country_codes, columns=["country_code"]),
            how="cross",
        )

    def get_gas_prices(self):
        """Return (low, high) "Natural gas" frames.

        Countries flagged 1 in ``steel_plant_region_ng_dict`` get the
        "Natural gas - low" prices, countries flagged 0 the "Natural gas - high"
        prices.
        """
        frames = []
        for metric, wanted_flag in (("Natural gas - low", 1), ("Natural gas - high", 0)):
            codes = [
                country_code
                for country_code, flag in self.steel_plant_region_ng_dict.items()
                if flag == wanted_flag
            ]
            df = self.get_gas_prices_per_country_and_year(metric, codes)
            df["material_category"] = "Natural gas"
            frames.append(df)
        return tuple(frames)

    def get_plastic_waste(self):
        """One-element tuple with the constant "Plastic waste" price frame."""
        return (self._constant_price_frame("Plastic waste"),)

    def get_fossil_category_prices(self, category):
        """Static prices of one fossil ``category`` for every country and year."""
        df = self._static_prices_by_year(category)
        df["material_category"] = category
        return pd.merge(
            df, pd.DataFrame(self.country_codes, columns=["country_code"]), how="cross"
        )

    def get_fossil_fuel_prices(self):
        """List of price frames, one per "Fossil Fuels" category.

        "Natural gas" and "Plastic waste" are excluded here because they have
        their own methods.
        """
        fossil_categories = [
            k
            for k, v in self.resource_category_mapper.items()
            if v == "Fossil Fuels" and k not in ("Natural gas", "Plastic waste")
        ]
        return [
            self.get_fossil_category_prices(category) for category in fossil_categories
        ]

    def get_feedstock_prices(self):
        """Return constant price frames for ("Iron ore", "Scrap", "DRI")."""
        return tuple(
            self._constant_price_frame(category)
            for category in ("Iron ore", "Scrap", "DRI")
        )

    def get_other_opex(self):
        """Return constant price frames for ("BF slag", "Other slag")."""
        return tuple(
            self._constant_price_frame(category)
            for category in ("BF slag", "Other slag")
        )

    def get_steam_prices(self):
        """One-element tuple with the "Steam" price frame for all countries."""
        df_steam = self.get_gas_prices_per_country_and_year("Steam", self.country_codes)
        df_steam["material_category"] = "Steam"
        return (df_steam,)

    def get_price_lookup_df(self):
        """Concatenate all price frames into one lookup table.

        The key columns ``material_category``, ``country_code`` and ``year`` are
        converted to ``category`` dtype. The concatenated index is not reset.
        """
        price_lookup_dfs = [
            *self.get_power_grid_prices(),
            *self.get_hydrogen_prices(),
            *self.get_bio_model_prices(),
            *self.get_store_and_transport_prices(),
            *self.get_gas_prices(),
            *self.get_plastic_waste(),
            *self.get_fossil_fuel_prices(),
            *self.get_feedstock_prices(),
            *self.get_other_opex(),
            *self.get_steam_prices(),
        ]
        # Done inline so the class does not depend on a helper defined in a
        # later notebook cell.
        df = pd.concat(price_lookup_dfs)
        for col in ("material_category", "country_code", "year"):
            df[col] = df[col].astype("category")
        return df

    # -------------------------------------------------------------- cost table

    def get_base_data_frame(self):
        """Cross the business cases with every (year, country_code) pair.

        Rows whose material category maps to "Emissivity" are dropped. Every
        column except ``value`` becomes ``category`` dtype, and a ``cost`` column
        initialised to 0.0 is appended.
        """
        dyc = pd.DataFrame(
            self.product_range_year_country, columns=["year", "country_code"]
        )
        emissions = {
            k for k, v in self.resource_category_mapper.items() if v == "Emissivity"
        }
        db = self.business_cases.copy()
        db = db[~db.material_category.isin(emissions)]
        df = db.merge(dyc, how="cross")
        categorical_columns = [col for col in df.columns if col != "value"]
        for col in categorical_columns:
            df[col] = df[col].astype("category")
        df["cost"] = 0.0
        return df

    def plant_variable_costs(self):
        """Return the base frame with ``price`` and ``cost = value * price``.

        Rows without a matching price keep ``price`` as NaN and get cost 0.0.
        """
        df = self.get_base_data_frame()
        df_prices = self.get_price_lookup_df()
        dm = df.merge(
            df_prices, on=["material_category", "year", "country_code"], how="left"
        )
        # Restore category dtype for the merge keys that are re-cast below.
        dm["country_code"] = dm["country_code"].astype("category")
        dm["material_category"] = dm["material_category"].astype("category")
        dm["cost"] = (dm.value * dm.price).fillna(0.0)
        return dm

In [126]:
# Load every input table from disk into a single ModelInput instance.
mi = ModelInput.from_filesystem()

In [127]:
%%time
# Variable costs via the merge-based implementation (timed by the cell magic).
dm = mi.plant_variable_costs()

CPU times: user 951 ms, sys: 137 ms, total: 1.09 s
Wall time: 1.09 s


# Build Price Lookup DataFrames From Dicts

## Helper Functions

In [57]:
def create_df_from_years_and_contry_codes(year_range, country_codes):
    """Build the cross product of country codes and years.

    Returns a DataFrame with the columns ``country_code`` and ``year`` holding
    one row per (country, year) combination, countries varying slowest.
    """
    countries = pd.DataFrame(country_codes, columns=["country_code"])
    years = pd.DataFrame(year_range, columns=["year"])
    return countries.merge(years, how="cross")


def convert_to_category(*args, columns=["material_category", "country_code", "year"]):
    """Cast ``columns`` of every given DataFrame to ``category`` dtype in place.

    The frames are modified directly; the same tuple of frames is returned so
    the call can be used in an expression.
    """
    frame_column_pairs = ((frame, name) for frame in args for name in columns)
    for frame, name in frame_column_pairs:
        frame[name] = frame[name].astype("category")
    return args


def merge_price_lookup_dfs(price_lookup_dfs):
    """Stack the given price frames and cast the key columns to ``category``.

    The index of the concatenated frame is left as produced by ``pd.concat``.
    """
    (combined,) = convert_to_category(pd.concat(price_lookup_dfs))
    return combined

## Power Grid Prices

In [58]:
def get_power_grid_prices(power_grid_prices_ref):
    """Flatten ``{(year, country_code): price}`` into an "Electricity" price frame.

    Returns a one-element tuple so the result can be star-unpacked together
    with the other price tables.
    """
    rows = (
        (year, country_code, price)
        for (year, country_code), price in power_grid_prices_ref.items()
    )
    electricity = pd.DataFrame(rows, columns=("year", "country_code", "price")).assign(
        material_category="Electricity"
    )
    return (electricity,)

## Hydrogen Prices

In [59]:
def get_hydrogen_prices(h2_prices_ref):
    """Flatten ``{(year, country_code): price}`` into a "Hydrogen" price frame.

    Returns a one-element tuple so the result can be star-unpacked together
    with the other price tables.
    """
    records = []
    for (year, country_code), price in h2_prices_ref.items():
        records.append((year, country_code, price))
    hydrogen = pd.DataFrame(records, columns=("year", "country_code", "price"))
    hydrogen["material_category"] = "Hydrogen"
    return (hydrogen,)

## Bio Model Prices

In [60]:
def get_bio_model_prices(bio_model_prices_ref):
    """Build "Biomass" and "Biomethane" price frames from one price mapping.

    Both frames carry identical prices taken from
    ``{(year, country_code): price}``; only ``material_category`` differs.
    """
    rows = [
        (year, country_code, price)
        for (year, country_code), price in bio_model_prices_ref.items()
    ]
    prices = pd.DataFrame(rows, columns=("year", "country_code", "price"))
    biomass = prices.assign(material_category="Biomass")
    biomethane = prices.assign(material_category="Biomethane")
    return biomass, biomethane

## CCS Transport / Storage Prices

In [61]:
def get_store_and_transport_prices(
    year_range, ccs_model_storage_ref, ccs_model_transport_ref
):
    """Build "Captured CO2" and "Used CO2" price frames.

    The price per country is storage price plus transport price; only countries
    present in both references are kept (inner merge). Each country price is
    repeated for every year in ``year_range``.
    """
    years = pd.DataFrame(year_range, columns=["year"])
    storage = pd.DataFrame(
        ccs_model_storage_ref.items(), columns=("country_code", "price_storage")
    )
    transport = pd.DataFrame(
        ccs_model_transport_ref.items(), columns=("country_code", "price_transport")
    )
    combined = storage.merge(transport, on="country_code")
    combined["price"] = combined["price_storage"] + combined["price_transport"]
    per_year = combined.drop(columns=["price_storage", "price_transport"]).merge(
        years, how="cross"
    )
    captured = per_year.assign(material_category="Captured CO2")
    used = per_year.assign(material_category="Used CO2")
    return captured, used

## Static Energy Prices

In [62]:
def get_gas_prices_per_country_and_year(
    year_range, static_energy_prices, gas_type, country_codes, default_year=2026
):
    """Return static prices of ``gas_type`` for every year and country.

    Args:
        year_range: iterable of model years to produce rows for.
        static_energy_prices: DataFrame indexed by (Metric, Year) with a
            ``Value`` column.
        gas_type: metric label to select from ``static_energy_prices``.
        country_codes: country codes the yearly prices are repeated for.
        default_year: year whose price is used for model years missing from
            the table. If this year is missing as well, the latest available
            year is used instead of raising ``KeyError``.

    Returns:
        DataFrame with the columns ``year``, ``price`` and ``country_code``.
    """
    gas_prices = static_energy_prices.loc[gas_type].reset_index()
    year_to_price = dict(zip(gas_prices.Year.values, gas_prices.Value.values))
    if default_year in year_to_price:
        default_price = year_to_price[default_year]
    else:
        # Fall back to the last year the table actually covers.
        default_price = year_to_price[max(year_to_price)]
    years = list(year_range)
    df = pd.DataFrame(
        {
            "year": years,
            "price": [year_to_price.get(year, default_price) for year in years],
        }
    )
    return pd.merge(
        df, pd.DataFrame(country_codes, columns=["country_code"]), how="cross"
    )


def get_gas_prices(year_range, steel_plant_region_ng_dict, static_energy_prices):
    """Build the low- and high-price "Natural gas" frames.

    Countries flagged 1 in ``steel_plant_region_ng_dict`` receive the
    "Natural gas - low" prices, countries flagged 0 the "Natural gas - high"
    prices. Returns ``(df_gas_low, df_gas_high)``.
    """

    def countries_flagged(wanted):
        return [
            code for code, flag in steel_plant_region_ng_dict.items() if flag == wanted
        ]

    def natural_gas_frame(metric, codes):
        frame = get_gas_prices_per_country_and_year(
            year_range, static_energy_prices, metric, codes
        )
        frame["material_category"] = "Natural gas"
        return frame

    cheap_gas_countries = countries_flagged(1)
    expensive_gas_countries = countries_flagged(0)
    gas_low = natural_gas_frame("Natural gas - low", cheap_gas_countries)
    gas_high = natural_gas_frame("Natural gas - high", expensive_gas_countries)
    return gas_low, gas_high

## Plastic Waste Prices

In [63]:
def get_plastic_waste(year_range, feedstock_dict, country_codes):
    """One-element tuple with a constant "Plastic waste" price per country/year."""
    plastic = create_df_from_years_and_contry_codes(year_range, country_codes).assign(
        price=feedstock_dict["Plastic waste"],
        material_category="Plastic waste",
    )
    return (plastic,)

## Fossil Fuel Prices

In [64]:
def get_fossil_category_prices(
    static_energy_prices, category, country_codes, year_range=None, default_year=2026
):
    """Return static prices of one fossil ``category`` per year and country.

    Args:
        static_energy_prices: DataFrame indexed by (Metric, Year) with a
            ``Value`` column.
        category: metric label to select; also written to ``material_category``.
        country_codes: country codes the yearly prices are repeated for.
        year_range: iterable of model years. Defaults to ``MODEL_YEAR_RANGE``,
            which is what this function always used before.
        default_year: year whose price is used for model years missing from
            the table. If this year is missing as well, the latest available
            year is used instead of raising ``KeyError``.

    Returns:
        DataFrame with the columns ``year``, ``price``, ``material_category``
        and ``country_code``.
    """
    if year_range is None:
        year_range = MODEL_YEAR_RANGE
    fossil_prices = static_energy_prices.loc[category].reset_index()
    year_to_price = dict(zip(fossil_prices.Year.values, fossil_prices.Value.values))
    if default_year in year_to_price:
        default_price = year_to_price[default_year]
    else:
        # Fall back to the last year the table actually covers.
        default_price = year_to_price[max(year_to_price)]
    years = list(year_range)
    df = pd.DataFrame(
        {
            "year": years,
            "price": [year_to_price.get(year, default_price) for year in years],
        }
    )
    df["material_category"] = category
    return pd.merge(
        df, pd.DataFrame(country_codes, columns=["country_code"]), how="cross"
    )


def get_fossil_fuel_prices(
    resource_category_mapper, static_energy_prices, country_codes
):
    """Return one price frame per "Fossil Fuels" category, as a list.

    "Natural gas" and "Plastic waste" are skipped here.
    """
    skipped = ("Natural gas", "Plastic waste")
    return [
        get_fossil_category_prices(static_energy_prices, name, country_codes)
        for name, group in resource_category_mapper.items()
        if group == "Fossil Fuels" and name not in skipped
    ]

## Feedstock / Raw Material

In [97]:
def get_feedstock_prices(year_range, country_codes, feedstock_dict):
    """Return constant price frames for "Iron ore", "Scrap" and "DRI" (in that order).

    Each frame holds the ``feedstock_dict`` price of its category for every
    country/year combination.
    """

    def constant_price_frame(category):
        frame = create_df_from_years_and_contry_codes(year_range, country_codes)
        frame["price"] = feedstock_dict[category]
        frame["material_category"] = category
        return frame

    iron_ore = constant_price_frame("Iron ore")
    scrap = constant_price_frame("Scrap")
    dri = constant_price_frame("DRI")
    return iron_ore, scrap, dri

## Other Opex

In [66]:
def get_other_opex(year_range, feedstock_dict, country_codes):
    """Return constant price frames for "BF slag" and "Other slag" (in that order)."""
    frames = []
    for category in ("BF slag", "Other slag"):
        frame = create_df_from_years_and_contry_codes(year_range, country_codes)
        frame["price"] = feedstock_dict[category]
        frame["material_category"] = category
        frames.append(frame)
    bf_slag, other_slag = frames
    return bf_slag, other_slag

## Steam

In [67]:
def get_steam_prices(year_range, static_energy_prices, country_codes):
    """One-element tuple with the "Steam" price frame for all given countries."""
    steam = get_gas_prices_per_country_and_year(
        year_range, static_energy_prices, "Steam", country_codes
    ).assign(material_category="Steam")
    return (steam,)

## Merge Price DataFrames

In [68]:
def get_price_lookup_df(
    power_grid_prices_ref,
    h2_prices_ref,
    bio_model_prices_ref,
    year_range,
    ccs_model_storage_ref,
    ccs_model_transport_ref,
    steel_plant_region_ng_dict,
    static_energy_prices,
    feedstock_dict,
    steel_plant_country_codes,
    resource_category_mapper,
):
    """Collect every per-category price frame and merge them into one lookup table.

    The individual builders are called in a fixed order and their frames are
    handed to ``merge_price_lookup_dfs``.
    """
    frames = []
    frames.extend(get_power_grid_prices(power_grid_prices_ref))
    frames.extend(get_hydrogen_prices(h2_prices_ref))
    frames.extend(get_bio_model_prices(bio_model_prices_ref))
    frames.extend(
        get_store_and_transport_prices(
            year_range, ccs_model_storage_ref, ccs_model_transport_ref
        )
    )
    frames.extend(
        get_gas_prices(year_range, steel_plant_region_ng_dict, static_energy_prices)
    )
    frames.extend(
        get_plastic_waste(year_range, feedstock_dict, steel_plant_country_codes)
    )
    frames.extend(
        get_fossil_fuel_prices(
            resource_category_mapper, static_energy_prices, steel_plant_country_codes
        )
    )
    frames.extend(
        get_feedstock_prices(year_range, steel_plant_country_codes, feedstock_dict)
    )
    frames.extend(get_other_opex(year_range, feedstock_dict, steel_plant_country_codes))
    frames.extend(
        get_steam_prices(year_range, static_energy_prices, steel_plant_country_codes)
    )
    return merge_price_lookup_dfs(frames)

In [69]:
%%time
# Keyword arguments make the mapping onto the eleven parameters explicit.
df_prices = get_price_lookup_df(
    power_grid_prices_ref=power_grid_prices_ref,
    h2_prices_ref=h2_prices_ref,
    bio_model_prices_ref=bio_model_prices_ref,
    year_range=MODEL_YEAR_RANGE,
    ccs_model_storage_ref=ccs_model_storage_ref,
    ccs_model_transport_ref=ccs_model_transport_ref,
    steel_plant_region_ng_dict=steel_plant_region_ng_dict,
    static_energy_prices=static_energy_prices,
    feedstock_dict=feedstock_dict,
    steel_plant_country_codes=steel_plant_country_codes,
    resource_category_mapper=RESOURCE_CATEGORY_MAPPER,
)

CPU times: user 121 ms, sys: 7.09 ms, total: 128 ms
Wall time: 127 ms


In [70]:
# Check dtypes and the deep memory footprint of the combined price lookup table.
df_prices.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79174 entries, 0 to 2355
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   year               79174 non-null  category
 1   country_code       79174 non-null  category
 2   price              79174 non-null  float64 
 3   material_category  79174 non-null  category
dtypes: category(3), float64(1)
memory usage: 1.5 MB


# Base DataFrame

In [71]:
def get_base_data_frame(
    product_range_year_country, resource_category_mapper, business_cases
):
    """Cross the business cases with every (year, country_code) pair.

    Rows whose material category maps to "Emissivity" are removed first. All
    columns except ``value`` are cast to ``category`` and a ``cost`` column
    filled with 0.0 is appended.
    """
    emissivity_categories = {
        name
        for name, group in resource_category_mapper.items()
        if group == "Emissivity"
    }
    year_country = pd.DataFrame(
        product_range_year_country, columns=["year", "country_code"]
    )
    cases = business_cases.copy()
    cases = cases[~cases.material_category.isin(emissivity_categories)]
    base = cases.merge(year_country, how="cross")
    for column in list(base.columns):
        if column == "value":
            continue
        base[column] = base[column].astype("category")
    base["cost"] = 0.0
    return base

In [116]:
def get_base_data_frame_mi(mi):
    """Same as ``get_base_data_frame`` but reading its inputs from ``mi``.

    ``mi`` must provide ``product_range_year_country``,
    ``resource_category_mapper`` and ``business_cases``.
    """
    year_country = pd.DataFrame(
        mi.product_range_year_country, columns=["year", "country_code"]
    )
    emissivity_categories = {
        name
        for name, group in mi.resource_category_mapper.items()
        if group == "Emissivity"
    }
    cases = mi.business_cases.copy()
    keep = ~cases.material_category.isin(emissivity_categories)
    base = cases[keep].merge(year_country, how="cross")
    to_category = [column for column in base.columns if column != "value"]
    for column in to_category:
        base[column] = base[column].astype("category")
    base["cost"] = 0.0
    return base

## New plant_variable_costs Function

In [117]:
def plant_variable_costs_newest():
    """Compute plant variable costs from a freshly loaded ``ModelInput``.

    The base frame is left-joined with the price lookup table; ``cost`` is
    ``value * price`` and is 0.0 where no price was found.
    """
    model_input = ModelInput.from_filesystem()
    base = get_base_data_frame_mi(model_input)
    prices = model_input.get_price_lookup_df()
    merged = base.merge(
        prices, on=("material_category", "year", "country_code"), how="left"
    )
    for key_column in ("country_code", "material_category"):
        merged[key_column] = merged[key_column].astype("category")
    merged["cost"] = (merged.value * merged.price).fillna(0.0)
    return merged

In [72]:
def plant_variable_costs_new(
    product_range_year_country,
    resource_category_mapper,
    business_cases,
    power_grid_prices_ref,
    h2_prices_ref,
    bio_model_prices_ref,
    year_range,
    ccs_model_storage_ref,
    ccs_model_transport_ref,
    steel_plant_region_ng_dict,
    static_energy_prices,
    feedstock_dict,
    steel_plant_country_codes,
):
    """Compute plant variable costs from explicitly passed input tables.

    Builds the base frame and the price lookup table, left-joins them on
    (material_category, year, country_code) and sets ``cost`` to
    ``value * price``, using 0.0 where no price was found.
    """
    base = get_base_data_frame(
        product_range_year_country, resource_category_mapper, business_cases
    )
    prices = get_price_lookup_df(
        power_grid_prices_ref,
        h2_prices_ref,
        bio_model_prices_ref,
        year_range,
        ccs_model_storage_ref,
        ccs_model_transport_ref,
        steel_plant_region_ng_dict,
        static_energy_prices,
        feedstock_dict,
        steel_plant_country_codes,
        resource_category_mapper,
    )
    join_keys = ("material_category", "year", "country_code")
    merged = base.merge(prices, on=join_keys, how="left")
    merged["country_code"] = merged["country_code"].astype("category")
    merged["material_category"] = merged["material_category"].astype("category")
    merged["cost"] = (merged.value * merged.price).fillna(0.0)
    return merged

In [24]:
# Base frame on its own, to inspect its size before prices are merged in.
df = get_base_data_frame(
    product_range_year_country=product_range_year_country,
    resource_category_mapper=RESOURCE_CATEGORY_MAPPER,
    business_cases=business_cases,
)

In [25]:
# Dtypes and deep memory footprint of the base frame.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1088472 entries, 0 to 1088471
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype   
---  ------             --------------    -----   
 0   technology         1088472 non-null  category
 1   material_category  1088472 non-null  category
 2   metric_type        1088472 non-null  category
 3   unit               1088472 non-null  category
 4   value              1088472 non-null  float64 
 5   year               1088472 non-null  category
 6   country_code       1088472 non-null  category
 7   cost               1088472 non-null  float64 
dtypes: category(6), float64(2)
memory usage: 31.2 MB


In [73]:
%%time
# Function-based variant; keyword arguments make the parameter mapping explicit.
dm = plant_variable_costs_new(
    product_range_year_country=product_range_year_country,
    resource_category_mapper=RESOURCE_CATEGORY_MAPPER,
    business_cases=business_cases,
    power_grid_prices_ref=power_grid_prices_ref,
    h2_prices_ref=h2_prices_ref,
    bio_model_prices_ref=bio_model_prices_ref,
    year_range=MODEL_YEAR_RANGE,
    ccs_model_storage_ref=ccs_model_storage_ref,
    ccs_model_transport_ref=ccs_model_transport_ref,
    steel_plant_region_ng_dict=steel_plant_region_ng_dict,
    static_energy_prices=static_energy_prices,
    feedstock_dict=feedstock_dict,
    steel_plant_country_codes=steel_plant_country_codes,
)

CPU times: user 936 ms, sys: 119 ms, total: 1.06 s
Wall time: 1.06 s


In [118]:
%%time
# ModelInput-based variant; note that this overwrites `dm` from the previous cell.
dm = plant_variable_costs_newest()

CPU times: user 965 ms, sys: 126 ms, total: 1.09 s
Wall time: 1.09 s


In [119]:
# Dtypes and deep memory footprint of the merged cost frame.
dm.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1088472 entries, 0 to 1088471
Data columns (total 9 columns):
 #   Column             Non-Null Count    Dtype   
---  ------             --------------    -----   
 0   technology         1088472 non-null  category
 1   material_category  1088472 non-null  category
 2   metric_type        1088472 non-null  category
 3   unit               1088472 non-null  category
 4   value              1088472 non-null  float64 
 5   year               1088472 non-null  category
 6   country_code       1088472 non-null  category
 7   cost               1088472 non-null  float64 
 8   price              1036640 non-null  float64 
dtypes: category(6), float64(3)
memory usage: 39.5 MB


In [120]:
# Spot check: a single technology / category / year / country combination.
query_str = "material_category == 'Electricity' and technology == 'DRI-Melt-BOF+CCUS' and year == 2050 and country_code == 'USA'"
dm.query(query_str)

Unnamed: 0,technology,material_category,metric_type,unit,value,year,country_code,cost,price
925907,DRI-Melt-BOF+CCUS,Electricity,Purchased energy,GJ/t steel,3.50132,2050,USA,59.091243,16.87685


# Compare Optimized Version with Reference

In [29]:
# Shape of the reference result.
df_reference.shape

(1088472, 9)

In [30]:
# Shape of the optimized result, to compare with the reference shape.
dm.shape

(1088472, 9)

In [31]:
# Dtypes and deep memory footprint of the reference result.
df_reference.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1088472 entries, 0 to 1192135
Data columns (total 9 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   technology         1088472 non-null  object 
 1   material_category  1088472 non-null  object 
 2   metric_type        1088472 non-null  object 
 3   unit               1088472 non-null  object 
 4   value              1088472 non-null  float64
 5   year               1088472 non-null  int64  
 6   country_code       1088472 non-null  object 
 7   cost               1088472 non-null  float64
 8   cost_type          1088472 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 443.3 MB


In [32]:
# Material categories present in the reference; these drive the comparison loop.
df_reference.material_category.unique()

array(['Iron ore', 'Scrap', 'DRI', 'Met coal', 'Coke', 'Thermal coal',
       'BF gas', 'COG', 'BOF gas', 'Natural gas', 'Plastic waste',
       'Biomass', 'Biomethane', 'Hydrogen', 'Electricity', 'Steam',
       'BF slag', 'Other slag', 'Captured CO2', 'Used CO2', 'Emissivity'],
      dtype=object)

In [76]:
def build_pk(df):
    """Index ``df`` by a string key built from its identifying columns.

    The key is ``material_category_technology_country_code_year`` joined with
    underscores. The four key columns are dropped from the result, which is
    sorted by the new ``pk`` index.

    The input frame is not modified. The key is built with vectorised string
    concatenation rather than a row-wise Python ``join``.

    Args:
        df: DataFrame containing at least the four key columns.

    Returns:
        A new DataFrame indexed by ``pk`` and sorted by it.
    """
    pk_columns = ["material_category", "technology", "country_code", "year"]
    # astype(str) also covers categorical and integer columns.
    pk = df[pk_columns[0]].astype(str)
    for column in pk_columns[1:]:
        pk = pk + "_" + df[column].astype(str)
    return df.drop(columns=pk_columns).set_index(pk.rename("pk")).sort_index()


def build_comparable_dataframes(category, df_reference, df_optimized):
    """Prepare reference and optimized rows of one category for comparison.

    Each frame is filtered to ``category``, stripped of the column the other
    frame lacks (``cost_type`` / ``price``) and re-indexed via ``build_pk``.
    Returns ``(reference, optimized)``.
    """

    def rows_for_category(frame, extra_column):
        selected = frame[frame.material_category == category].copy()
        return selected.drop(extra_column, axis=1).reset_index(drop=True)

    reference_rows = rows_for_category(df_reference, "cost_type")
    optimized_rows = rows_for_category(df_optimized, "price")
    return build_pk(reference_rows), build_pk(optimized_rows)


def compare_by_material_category(category, df_reference, df_optimized):
    """Print a comparison of reference and optimized results for one category.

    Reports both shapes, whether every column is element-wise equal, and the
    two cost sums.
    """
    reference, optimized = build_comparable_dataframes(
        category, df_reference, df_optimized
    )
    print("Category: ", category)
    shapes_match = reference.shape == optimized.shape
    print("shape: ", reference.shape, optimized.shape, shapes_match)
    columns_equal = np.all((reference == optimized).all().values)
    print("column values: ", columns_equal)
    print("sum cost: ", reference.cost.sum(), optimized.cost.sum())
    print("-----------------------------")

In [34]:
# Single-category check before running the comparison over all categories.
compare_by_material_category("Hydrogen", df_reference, dm)

Category:  Hydrogen
shape:  (51832, 4) (51832, 4) True
column values:  True
sum cost:  1149369.2660721818 1149369.2660721818
-----------------------------


In [35]:
# Same check for a feedstock category.
compare_by_material_category("Iron ore", df_reference, dm)

Category:  Iron ore
shape:  (51832, 4) (51832, 4) True
column values:  True
sum cost:  6224864.016437758 6224864.016437758
-----------------------------


In [128]:
# Compare every material category found in the reference result.
for category in df_reference.material_category.unique():
    compare_by_material_category(category, df_reference, dm)

Category:  Iron ore
shape:  (51832, 4) (51832, 4) True
column values:  True
sum cost:  6224864.016437758 6224864.016437758
-----------------------------
Category:  Scrap
shape:  (51832, 4) (51832, 4) True
column values:  True
sum cost:  2309189.8733712 2309189.8733712
-----------------------------
Category:  DRI
shape:  (51832, 4) (51832, 4) True
column values:  True
sum cost:  0.0 0.0
-----------------------------
Category:  Met coal
shape:  (51832, 4) (51832, 4) True
column values:  True
sum cost:  563174.7219761774 563174.7219761774
-----------------------------
Category:  Coke
shape:  (51832, 4) (51832, 4) True
column values:  True
sum cost:  0.0 0.0
-----------------------------
Category:  Thermal coal
shape:  (51832, 4) (51832, 4) True
column values:  True
sum cost:  284071.0588329466 284071.0588329466
-----------------------------
Category:  BF gas
shape:  (51832, 4) (51832, 4) True
column values:  True
sum cost:  0.0 0.0
-----------------------------
Category:  COG
shape:  (518