In [1]:
%load_ext lab_black

In [2]:
import numpy as np
import pandas as pd

from mppsteel.config.model_scenarios import DEFAULT_SCENARIO
from mppsteel.config.reference_lists import RESOURCE_CATEGORY_MAPPER
from mppsteel.config.model_config import USD_TO_EUR_CONVERSION_DEFAULT, PROJECT_PATH

from mppsteel.data_preprocessing.variable_plant_cost_archetypes import (
    PlantVariableCostsInput,
    build_variable_cost_df,
    plant_variable_costs_vectorized,
    plant_variable_costs_row_wise,
)

# Read the Data from Filesystem

Prerequisite: `python main.py -f` must have been run at least once before executing this notebook.

In [3]:
# Work on a copy rather than on DEFAULT_SCENARIO itself.
scenario_dict = DEFAULT_SCENARIO.copy()
# Store both conversion directions; eur_to_usd is the reciprocal of usd_to_eur.
scenario_dict["usd_to_eur"] = USD_TO_EUR_CONVERSION_DEFAULT
scenario_dict["eur_to_usd"] = 1.0 / scenario_dict["usd_to_eur"]
# Input object shared by all cost calculations in this notebook.
pvci = PlantVariableCostsInput.from_filesystem(scenario_dict)

# High-Level Comparison

## Old Cost Calculation Function

This is the old plant_variable_costs function, calculating the cost for each row individually.

In [4]:
%%time
# Baseline timing: the row-wise implementation; the result is kept in `dr`.
dr = plant_variable_costs_row_wise(pvci)

Variable Cost Loop: 100%|█████████████████| 2356/2356 [00:43<00:00, 54.01it/s]


CPU times: user 44.1 s, sys: 405 ms, total: 44.5 s
Wall time: 44.4 s


## New Cost Calculation Function

The new function builds up a dataframe containing the price for each material category and then just calculates all costs in one pass.

In [5]:
%%time
# Same input, vectorized implementation. Note: this rebinds `dr`.
dr = plant_variable_costs_vectorized(pvci)

CPU times: user 595 ms, sys: 80.3 ms, total: 675 ms
Wall time: 674 ms


# Low-Level Comparison

Directly compare iterating over rows with just calculating all at once using [vectorization](https://www.youtube.com/watch?v=nxWginnBklU) (this talk is not 100% correct about everything, but provides a nice overview).

In [7]:
# Frame used by the micro-benchmarks below; the mapper functions read its
# `value` and `material_category` columns.
df = build_variable_cost_df(pvci)

In [8]:
def minimal_mapper(row, lookup={}):
    """Return the row's value scaled by the multiplier of its material category.

    Args:
        row: Object exposing ``value`` and ``material_category`` attributes.
        lookup: Mapping from material category to multiplier. Categories
            without an entry use 1.0. The mapping is only read.

    Returns:
        ``row.value`` times the multiplier found for the row's category.
    """
    amount = row.value
    multiplier = lookup.get(row.material_category, 1.0)
    return amount * multiplier


# Distinct material categories present in df; used as the branch list below.
CATEGORIES = list(df.material_category.unique())


def mapper_with_multiple_branches(row, categories=CATEGORIES, lookup={}):
    """Simulate the original if/elif cascade and return the row's cost.

    The row's material category is compared against each entry of
    ``categories`` in turn. As in an if/elif chain, the first matching
    branch produces the result and the remaining branches are skipped.

    Args:
        row: Object exposing ``value`` and ``material_category`` attributes,
            e.g. a row handed in by ``DataFrame.apply(..., axis=1)``.
        categories: Categories to test, in order.
        lookup: Mapping from material category to multiplier. Categories
            without an entry use 1.0. The mapping is only read.

    Returns:
        ``row.value`` times the multiplier for the row's category. A row
        whose category is not in ``categories`` gets the same result that
        ``minimal_mapper`` computes, so both mappers are interchangeable.
    """
    for category in categories:
        if row.material_category == category:
            # Without this return the product was computed and thrown away,
            # leaving every call (and thus the whole cost column) as None.
            return row.value * lookup.get(row.material_category, 1.0)
    # No branch matched: fall back to the plain lookup.
    return row.value * lookup.get(row.material_category, 1.0)

## Just Using a Minimal Mapper

This gives a comparison between a plain for loop with a single dict lookup and the vectorized variant.

In [9]:
%%time
# Row-wise baseline: apply(axis=1) calls the Python function once per row.
df["cost"] = df.apply(minimal_mapper, axis=1)

CPU times: user 8.03 s, sys: 84.4 ms, total: 8.11 s
Wall time: 8.11 s


## Emulating the if / elif Cascade from the old Function

Doing more in the inner for loop is really slow.

In [10]:
%%time
# Same row-wise apply, but every call also compares the row against the category list.
df["cost"] = df.apply(mapper_with_multiple_branches, axis=1)

CPU times: user 1min 6s, sys: 168 ms, total: 1min 6s
Wall time: 1min 6s


## Vectorized Calculation

This uses [SIMD](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) instructions to iterate over the data and is therefore about three orders of magnitude faster than even the minimal for-loop-based calculation.

In [19]:
%%time
# Whole-column multiplication by a scalar: no Python-level loop over rows.
df["cost"] = df.value * 1.0

CPU times: user 5.1 ms, sys: 4.15 ms, total: 9.25 ms
Wall time: 5.95 ms


It's not about the scalar; multiplying by a series is fast, too.

In [24]:
%%time
# Multiply by a second Series instead of a scalar. pandas aligns Series
# operands on their index labels, so the helper Series is built on df.index;
# with a fresh 0..n-1 index any row whose label is missing there would
# silently become NaN.
df["cost"] = df.value * pd.Series(np.ones(len(df)), index=df.index)

CPU times: user 8.8 ms, sys: 6.76 ms, total: 15.6 ms
Wall time: 11.9 ms


# Memory Usage

An additional optimization was to use the [category](https://pandas.pydata.org/docs/user_guide/categorical.html) datatype from pandas instead of the default `object` dtype, which is used when string data is stored in a dataframe. This has little influence on the runtime, but reduces memory usage by about one order of magnitude.

In [12]:
def convert_to_category(*args, columns=["material_category", "country_code", "year"]):
    """Cast the named columns of each given dataframe to the category dtype.

    The dataframes are modified in place.

    Args:
        *args: Dataframes to convert; each must contain every name in ``columns``.
        columns: Names of the columns to cast. The list is only read.

    Returns:
        The tuple of dataframes that was passed in (the same objects).
    """

    def _cast_columns(frame):
        # Columns are converted one after another, in the order given.
        for name in columns:
            frame[name] = frame[name].astype("category")

    for frame in args:
        _cast_columns(frame)
    return args

In [13]:
# Rebuild the frame and report its memory footprint;
# memory_usage="deep" also inspects the contents of object columns.
df = build_variable_cost_df(pvci)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1088472 entries, 0 to 1088471
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype   
---  ------             --------------    -----   
 0   technology         1088472 non-null  category
 1   material_category  1088472 non-null  category
 2   metric_type        1088472 non-null  category
 3   unit               1088472 non-null  category
 4   value              1088472 non-null  float64 
 5   year               1088472 non-null  category
 6   country_code       1088472 non-null  category
 7   cost               1088472 non-null  float64 
dtypes: category(6), float64(2)
memory usage: 31.2 MB


In [14]:
# For comparison: cast the label columns to plain object dtype and
# `year` to int, then measure the memory footprint again.
df = build_variable_cost_df(pvci)
for col in ("technology", "material_category", "metric_type", "unit", "country_code"):
    df[col] = df[col].astype(object)
df["year"] = df.year.astype(int)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1088472 entries, 0 to 1088471
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   technology         1088472 non-null  object 
 1   material_category  1088472 non-null  object 
 2   metric_type        1088472 non-null  object 
 3   unit               1088472 non-null  object 
 4   value              1088472 non-null  float64
 5   year               1088472 non-null  int64  
 6   country_code       1088472 non-null  object 
 7   cost               1088472 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 373.9 MB


In [15]:
# Memory footprint of the frame produced by the row-wise implementation.
dr = plant_variable_costs_row_wise(pvci)
dr.info(memory_usage="deep")

Variable Cost Loop: 100%|█████████████████| 2356/2356 [00:43<00:00, 53.59it/s]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1088472 entries, 0 to 1192135
Data columns (total 9 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   technology         1088472 non-null  object 
 1   material_category  1088472 non-null  object 
 2   metric_type        1088472 non-null  object 
 3   unit               1088472 non-null  object 
 4   value              1088472 non-null  float64
 5   year               1088472 non-null  int64  
 6   country_code       1088472 non-null  object 
 7   cost               1088472 non-null  float64
 8   cost_type          1088472 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 443.3 MB
