# Datathon metric implementation using Python

In [1]:
import pandas as pd
import numpy as np

Custom uncertainty metric (forecast intervals metric)

In [2]:
def _interval_score(actuals, upper_bound, lower_bound, avg_volume):
    """Score one window of months: interval width plus out-of-interval misses.

    All three vectors must cover the same months. The result is expressed
    as a percentage of ``avg_volume`` averaged over the window length.
    """
    # Wide intervals are penalized
    width = np.sum(np.abs(upper_bound - lower_bound))
    # If actuals are outside of the interval, the miss distance adds error;
    # the boolean mask zeroes out months where the actual is inside.
    below = np.sum((lower_bound - actuals) * (actuals < lower_bound))
    above = np.sum((actuals - upper_bound) * (actuals > upper_bound))
    return (
        0.85 * width + 0.15 * 2 / 0.05 * (below + above)
    ) / (len(actuals) * avg_volume) * 100


def uncertainty_metric(actuals, upper_bound, lower_bound, avg_volume):
    """
    This function aims to compute the Uncertainty Metric for the
    Novartis Datathon, 3rd edition.

    Given the actuals followed by the upper_bound and lower_bound intervals and the
    average volume, it will compute the metric score. The first 6 months and
    the remaining 18 months are scored separately and combined with weights
    0.6 and 0.4 respectively.

    Keyword parameters:
        actuals (float vector): Real value of Y
        upper_bound (float vector): upper_bound forecast interval (percentile 95)
        lower_bound (float vector): lower_bound forecast interval (percentile 5)
        avg_volume (float): Average monthly volume of the 12 months
                            prior to the generic entry.

    Returns:
        error_metric: Uncertainty Metric score (%)

    Raises:
        AssertionError: if the three vectors do not all have 24 elements.
    """
    # Work on positional float arrays so lists, arrays and Series all behave
    # the same way.
    actuals = np.asarray(actuals, dtype=float)
    upper_bound = np.asarray(upper_bound, dtype=float)
    lower_bound = np.asarray(lower_bound, dtype=float)

    # Assert that all the sizes are OK. A chained comparison is required here:
    # comparing the two boolean results with == would also accept inputs
    # where both checks fail.
    assert len(lower_bound) == len(upper_bound) == len(actuals) == 24, \
        "We should have 24 sorted actuals, upper_bound and lower_bound intervals"

    uncertainty_first6 = _interval_score(
        actuals[:6], upper_bound[:6], lower_bound[:6], avg_volume
    )
    uncertainty_last18 = _interval_score(
        actuals[6:], upper_bound[6:], lower_bound[6:], avg_volume
    )

    return (0.6 * uncertainty_first6 + 0.4 * uncertainty_last18)

In [3]:
def custom_metric(actuals, forecast, avg_volume):
    """
    This function aims to compute the Custom Accuracy Metric
    for the Novartis Datathon, 3rd edition.

    Given the actuals followed by the forecast and the avg_volume
    of the brand, it will compute the metric score as a weighted sum of
    four errors, each normalised by the average volume:

        0.5 * month-by-month absolute error over all 24 months
        0.3 * absolute error of the months 0-5 totals
        0.1 * absolute error of the months 6-11 totals
        0.1 * absolute error of the months 12-23 totals

    Keyword parameters:
        actuals (float vector): Real value of Y
        forecast (float vector): Volume forecast
        avg_volume (float): Average monthly volume of the 12 months
                            prior to the generic entry.

    Returns:
        custom_metric: Custom Accuracy Metric score (%)

    Raises:
        AssertionError: if actuals and forecast do not both have 24 elements.
    """
    # Work on positional float arrays so lists, arrays and Series all behave
    # the same way.
    actuals = np.asarray(actuals, dtype=float)
    forecast = np.asarray(forecast, dtype=float)

    # The formula hard-codes a 24-month horizon, so reject any other size
    # instead of silently returning a wrongly normalised score.
    assert len(actuals) == len(forecast) == 24, \
        "We should have 24 sorted actuals and forecasts"

    def _window_error(start, stop):
        # Error of the window totals, normalised by window length * avg volume
        months = stop - start
        total_gap = abs(actuals[start:stop].sum() - forecast[start:stop].sum())
        return total_gap / (months * avg_volume)

    # First part of the equation: month-by-month error over the 24 months
    monthly_error = np.abs(actuals - forecast).sum() / (24 * avg_volume)

    # Second to fourth parts: errors on the totals of each window
    first_6_error = _window_error(0, 6)
    months_6_12_error = _window_error(6, 12)
    months_12_24_error = _window_error(12, 24)

    # Weighted combination (local name avoids shadowing the function itself)
    score = 0.5 * monthly_error + 0.3 * first_6_error + \
        0.1 * (months_6_12_error + months_12_24_error)

    return score * 100

In [4]:
def apply_metrics(x):
    """
    Compute both datathon metrics for a single id.

    Intended to be passed to ``groupby(...).apply`` so that it receives the
    rows of one id at a time.
    IMPORTANT FACT: The metric should only be computed on id's with
                    24 months of data.

    Keyword parameters:
        x (grouped pd.DataFrame): rows of one group, with the columns actuals,
                                  forecast, upper_bound, lower_bound, avg_vol

    Returns:
        pd.Series with two entries: custom_metric and uncertainty_metric

    Example use:
        your_dataframe.groupby(id_col).apply(apply_metrics)
    """
    actuals, forecast = x["actuals"], x["forecast"]
    # The first avg_vol value of the group is used for both metrics
    avg_vol = x["avg_vol"].values[0]

    scores = {"custom_metric": custom_metric(actuals, forecast, avg_vol)}
    scores["uncertainty_metric"] = uncertainty_metric(
        actuals, x["upper_bound"], x["lower_bound"], avg_vol
    )

    return pd.Series(scores, index=list(scores))

An example on a mock dataframe (to get used to the metric)

In [2]:
# Mock data for one (country, brand) id: 24 months with constant values
data_dict = {
    "country": ["1"] * 24,
    "brand": ["1"] * 24,
    "actuals": [1000.0] * 24,
    "forecast": [950.0] * 24,
    "avg_vol": [10000] * 24,
    "lower_bound": [800] * 24,
    "upper_bound": [1200] * 24,
    "month_num": list(range(24)),
}

# Columns that identify one series
id_cols = ["country", "brand"]

# Keep the columns in the order in which the dict declares them
df = pd.DataFrame(data_dict, columns=list(data_dict))

In [3]:
# Display the mock dataframe
df

Unnamed: 0,country,brand,actuals,forecast,avg_vol,lower_bound,upper_bound,month_num
0,1,1,1000.0,950.0,10000,800,1200,0
1,1,1,1000.0,950.0,10000,800,1200,1
2,1,1,1000.0,950.0,10000,800,1200,2
3,1,1,1000.0,950.0,10000,800,1200,3
4,1,1,1000.0,950.0,10000,800,1200,4
5,1,1,1000.0,950.0,10000,800,1200,5
6,1,1,1000.0,950.0,10000,800,1200,6
7,1,1,1000.0,950.0,10000,800,1200,7
8,1,1,1000.0,950.0,10000,800,1200,8
9,1,1,1000.0,950.0,10000,800,1200,9


In [6]:
# Calculate the metric applying the "apply_metrics" function
# (per-metric mean of the absolute values across all ids).
# DataFrame.abs().mean() is used instead of np.mean(DataFrame): with
# pandas >= 2.0, np.mean dispatches to DataFrame.mean(axis=None), which
# collapses both axes into one scalar instead of one mean per metric column.
df.groupby(id_cols).apply(apply_metrics).abs().mean()

custom_metric         0.5
uncertainty_metric    3.4
dtype: float64

Datathon example

In [7]:
# Columns that together identify one series
id_cols = ["country", "brand"]

# Read the volume data, using the first CSV column as the index
df_mock = pd.read_csv("gx_volume.csv", index_col=0)

In [8]:
# Preview the first rows of the loaded volume data
df_mock.head()

Unnamed: 0,country,brand,volume,month_num,month_name
1,country_1,brand_3,18509088.6,-88,Jul
2,country_1,brand_3,19697508.0,-87,Aug
3,country_1,brand_3,18315721.8,-86,Sep
4,country_1,brand_3,19831199.4,-85,Oct
5,country_1,brand_3,18593281.8,-84,Nov


In [9]:
# Mean volume per id over the months with -12 <= month_num < 0
avg_12_volume = (
    df_mock
    .loc[lambda d: (d["month_num"] >= -12) & (d["month_num"] < 0)]
    .groupby(id_cols)["volume"]
    .mean()
    .reset_index()
)
avg_12_volume

Unnamed: 0,country,brand,volume
0,country_1,brand_10,7.325746e+06
1,country_1,brand_102,3.073476e+07
2,country_1,brand_115,4.462947e+07
3,country_1,brand_117,2.578179e+07
4,country_1,brand_119,2.027448e+08
...,...,...,...
1073,country_8,brand_87,1.653419e+07
1074,country_8,brand_92,7.195820e+06
1075,country_9,brand_167,2.941204e+06
1076,country_9,brand_187,7.768339e+07


In [10]:
# Let's get avg_12 months: mean volume per id over the 12 months before
# month 0 (-12 <= month_num < 0), stored in a column named avg_vol.
avg_12_volume = (
    df_mock[(df_mock.month_num >= -12) & (df_mock.month_num < 0)]
    .groupby(id_cols)["volume"]
    .mean()
    .reset_index()
    .rename(columns={"volume": "avg_vol"})
)

# Drop any avg_vol left by a previous run of this cell before merging, so
# that re-running it is idempotent instead of producing avg_vol_x / avg_vol_y.
df_mock = pd.merge(
    df_mock.drop(columns="avg_vol", errors="ignore"),
    avg_12_volume,
    on=id_cols,
    how="left",
)

In [11]:
# Preview the first rows after merging in the avg_vol column
df_mock.head()

Unnamed: 0,country,brand,volume,month_num,month_name,avg_vol
0,country_1,brand_3,18509088.6,-88,Jul,12395405.55
1,country_1,brand_3,19697508.0,-87,Aug,12395405.55
2,country_1,brand_3,18315721.8,-86,Sep,12395405.55
3,country_1,brand_3,19831199.4,-85,Oct,12395405.55
4,country_1,brand_3,18593281.8,-84,Nov,12395405.55


In [12]:
# Mock example: keep two (country, brand) ids and only the months
# 0 <= month_num < 24, i.e. the months to be forecast
df_metric = df_mock.loc[
    lambda d: (
        (d["country"].eq("country_8") & d["brand"].eq("brand_117"))
        | (d["country"].eq("country_7") & d["brand"].eq("brand_5"))
    )
    & d["month_num"].ge(0)
    & d["month_num"].lt(24)
]

In [13]:
# Example forecast and renaming columns to the names expected by apply_metrics
df_metric = df_metric.rename(columns={"volume": "actuals"})

# Create mock forecasts: actuals multiplied by normal noise (mean 1, sd 0.3).
# A seeded generator is used so that Restart & Run All reproduces the same
# numbers on every run.
rng = np.random.default_rng(42)
df_metric["forecast"] = \
    df_metric["actuals"] * rng.normal(1, 0.3, len(df_metric))

# Mock interval: +/- 15% around the forecast
df_metric["lower_bound"] = df_metric["forecast"] * 0.85
df_metric["upper_bound"] = df_metric["forecast"] * 1.15

Metric computation (the mean of the absolute per-id scores gives the overall metrics for the dataset)

In [14]:
# Example metric by country brand: apply_metrics is called once per
# (country, brand) group and returns its custom_metric and uncertainty_metric
df_metric.groupby(id_cols).apply(apply_metrics)

Unnamed: 0_level_0,Unnamed: 1_level_0,custom_metric,uncertainty_metric
country,brand,Unnamed: 2_level_1,Unnamed: 3_level_1
country_7,brand_5,11.146665,54.942479
country_8,brand_117,18.272744,110.959544


In [15]:
# Example overall metric: per-metric mean of the absolute scores across the
# ids present in df_metric.
# DataFrame.abs().mean() is used instead of np.mean(DataFrame): with
# pandas >= 2.0, np.mean dispatches to DataFrame.mean(axis=None), which
# collapses both axes into one scalar instead of one mean per metric column.
df_metric.groupby(id_cols).apply(apply_metrics).abs().mean()

custom_metric         14.709705
uncertainty_metric    82.951011
dtype: float64