In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool

In [None]:
# Load the three training CSVs (volume, generics, medicine info), in that order.
vol, gxs, info = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train/df_volume_train.csv',
        'data/train/df_generics_train.csv',
        'data/train/df_medicine_info_train.csv',
    )
)

In [None]:
# One modelling frame: volume rows, left-joined with the generics table on
# (country, brand_name, months_postgx) and with the medicine info on
# (country, brand_name). `validate` makes pandas raise if the join keys are
# not unique where they are expected to be.
df = (
    vol
    .merge(
        gxs,
        how='left',
        on=['country', 'brand_name', 'months_postgx'],
        validate='one_to_one',
    )
    .merge(
        info,
        how='left',
        on=['country', 'brand_name'],
        validate='many_to_one',
    )
)

In [None]:
# Avgj: mean volume of each (country, brand_name) series over
# months_postgx -12..-1 (both ends inclusive).
in_pre_year = df["months_postgx"].between(-12, -1)
avgj = (
    df.loc[in_pre_year]
    .groupby(["country", "brand_name"])["volume"]
    .mean()
    .rename("Avgj")
)

# Broadcast the per-series value back onto every row of that series.
df = df.merge(
    avgj,
    how="left",
    on=["country", "brand_name"],
    validate="many_to_one",
)

In [None]:
# Volume expressed as a multiple of the series' own Avgj.
df["target_norm"] = df["volume"].div(df["Avgj"])

In [None]:
# Per-series summary of target_norm over all rows with months_postgx < 0.
is_pre_entry = df["months_postgx"] < 0
pre = df.loc[is_pre_entry]

pre_stats = (
    pre
    .groupby(["country", "brand_name"])["target_norm"]
    .agg(
        **{
            "pre_mean": "mean",
            "pre_std": "std",
            "pre_min": "min",
            "pre_max": "max",
            # Slope of a degree-1 fit of the values against their row position
            # (0, 1, 2, ...); defined as 0 when there are fewer than two points.
            "pre_trend": lambda x: 0 if len(x) <= 1 else np.polyfit(range(len(x)), x, 1)[0],
        }
    )
)

df = df.merge(
    pre_stats,
    how="left",
    on=["country", "brand_name"],
    validate="many_to_one",
)

In [None]:
# Eight consecutive 3-month windows covering months_postgx -24..-1
# (bounds are inclusive on both sides).
windows = {
    "t1": (-24, -22), "t2": (-21, -19), "t3": (-18, -16), "t4": (-15, -13),
    "t5": (-12, -10), "t6": (-9, -7), "t7": (-6, -4), "t8": (-3, -1),
}

trimester_frames = []

for name, (start, end) in windows.items():
    in_window = df["months_postgx"].between(start, end)
    window_target = df.loc[in_window].groupby(["country", "brand_name"])["target_norm"]

    # Output column name -> aggregation, prefixed with the window label.
    window_aggs = {
        f"{name}_mean": "mean",
        f"{name}_std": "std",
        f"{name}_min": "min",
        f"{name}_max": "max",
        # Slope of a degree-1 fit against row position; 0 for fewer than two points.
        f"{name}_trend": lambda x: 0 if len(x) <= 1 else np.polyfit(range(len(x)), x, 1)[0],
    }
    trimester_frames.append(window_target.agg(**window_aggs))

# One wide table (a row per series, a column per window statistic) ...
trimester_stats = pd.concat(trimester_frames, axis=1)

# ... broadcast back onto every row of the series.
df = df.merge(trimester_stats, how="left", on=["country", "brand_name"])

In [None]:
# Sine/cosine encoding of months_postgx with a 12-month period.
months_postgx_angle = df['months_postgx'] * (2.*np.pi/12)
df['months_postgx_sin'] = np.sin(months_postgx_angle)
df['months_postgx_cos'] = np.cos(months_postgx_angle)

# Split

In [None]:
# Split by series, not by row: every (country, brand_name) pair ends up in
# exactly one of train / eval scenario 1 / eval scenario 2.
pairs = df.loc[:, ["country", "brand_name"]].drop_duplicates()

# 70% of the pairs for training, 30% held out ...
train_pairs, eval_pairs = train_test_split(
    pairs, test_size=0.3, random_state=42, shuffle=True
)

# ... and the held-out pairs halved between the two evaluation scenarios.
eval_pairs_s1, eval_pairs_s2 = train_test_split(
    eval_pairs, test_size=0.5, random_state=42, shuffle=True
)

# Inner-join the full frame against each pair list to get the row-level frames.
train_df, eval_df_s1, eval_df_s2 = (
    df.merge(pair_subset, on=["country", "brand_name"])
    for pair_subset in (train_pairs, eval_pairs_s1, eval_pairs_s2)
)

In [None]:
# Display the (rows, columns) shape of the training frame and both evaluation frames.
train_df.shape, eval_df_s1.shape, eval_df_s2.shape

In [None]:
# Turn the held-out frames into forecasting problems: keep the true volumes
# aside, blank out what the model must predict, and add empty columns that the
# forecasting loops fill in.
answer_cols = ['country', 'brand_name', 'months_postgx', 'volume']
placeholder_cols = ["lag1", "lag2", "lag3", "roll5_mean", "roll5_std", "pred"]

# Scenario 1: everything from months_postgx 0 onwards is hidden.
hidden_s1 = eval_df_s1['months_postgx'] >= 0
eval_df_s1_true = eval_df_s1.loc[hidden_s1, answer_cols]
eval_df_s1.loc[hidden_s1, ['volume', 'target_norm']] = np.nan
for col in placeholder_cols:
    eval_df_s1[col] = np.nan

# Scenario 2: months 0..5 stay visible, everything from month 6 onwards is hidden.
hidden_s2 = eval_df_s2['months_postgx'] >= 6
eval_df_s2_true = eval_df_s2.loc[hidden_s2, answer_cols]
eval_df_s2.loc[hidden_s2, ['volume', 'target_norm']] = np.nan
for col in placeholder_cols:
    eval_df_s2[col] = np.nan

# Introducing lags

In [None]:
# Lag and rolling features of target_norm, built per (country, brand_name) series.
# Every feature of a given month uses only *earlier* months, which is what the
# recursive forecasting loops supply at prediction time (they read from a
# `history` list that does not yet contain the month being predicted).

# Positional shifts only mean "previous month" when rows are in chronological
# order inside each series, so enforce that order explicitly.
train_df = (
    train_df
    .sort_values(["country", "brand_name", "months_postgx"])
    .reset_index(drop=True)
)

target_by_series = train_df.groupby(["country", "brand_name"])["target_norm"]

for k in (1, 2, 3):
    train_df[f"lag{k}"] = target_by_series.shift(k)

# Rolling statistics over the 5 months before the current one:
# - shift(1) removes the current month's target from the window (no label leakage);
# - transform() returns values on train_df's own index, so each value lands on
#   the row it was computed for regardless of how groupby orders the groups;
# - ddof=0 matches np.std, which the forecasting loops use for roll5_std.
train_df["roll5_mean"] = target_by_series.transform(
    lambda s: s.shift(1).rolling(5).mean()
)
train_df["roll5_std"] = target_by_series.transform(
    lambda s: s.shift(1).rolling(5).std(ddof=0)
)

In [None]:
# Display the column names now available in the training frame.
train_df.columns

In [None]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    'month',
    'months_postgx',
    'ther_area',
    'main_package',
    'biological',
    'small_molecule',
]

# Every statistic block (pre_* and t1_*..t8_*) exposes the same five suffixes.
stat_suffixes = ('mean', 'std', 'min', 'max', 'trend')

# Numeric features, in the order the models see them.
num_features = (
    ['n_gxs', 'hospital_rate', 'Avgj']
    + [f'pre_{suffix}' for suffix in stat_suffixes]
    + [f't{k}_{suffix}' for k in range(1, 9) for suffix in stat_suffixes]
    + ['months_postgx_sin', 'months_postgx_cos']
    + ['lag1', 'lag2', 'lag3', 'roll5_mean', 'roll5_std']
)

# Categorical columns first, then numeric ones.
features = cat_features + num_features

In [None]:
# Scenario 1 model: fitted on training rows with months_postgx >= 0,
# predicting target_norm with an MAE objective.
train_rows_s1 = train_df.loc[train_df['months_postgx'] >= 0]

train_pool_s1 = Pool(
    data=train_rows_s1[features],
    label=train_rows_s1['target_norm'],
    # CatBoost takes the categorical columns as positions within `features`.
    cat_features=[features.index(c) for c in cat_features],
)

catboost_params_s1 = dict(
    iterations=2000,
    learning_rate=0.03,
    depth=8,
    loss_function="MAE",
    eval_metric="MAE",
    random_seed=42,
    verbose=200,
)
model_s1 = CatBoostRegressor(**catboost_params_s1)

model_s1.fit(train_pool_s1)


In [None]:
# Scenario 2 model: same configuration as scenario 1, but fitted only on
# training rows with months_postgx >= 6.
train_rows_s2 = train_df.loc[train_df['months_postgx'] >= 6]

train_pool_s2 = Pool(
    data=train_rows_s2[features],
    label=train_rows_s2['target_norm'],
    # CatBoost takes the categorical columns as positions within `features`.
    cat_features=[features.index(c) for c in cat_features],
)

catboost_params_s2 = dict(
    iterations=2000,
    learning_rate=0.03,
    depth=8,
    loss_function="MAE",
    eval_metric="MAE",
    random_seed=42,
    verbose=200,
)
model_s2 = CatBoostRegressor(**catboost_params_s2)

model_s2.fit(train_pool_s2)


In [None]:
# Scenario 1 evaluation: recursive forecast of months_postgx >= 0, one series at
# a time. Lag/rolling inputs are read from `history`, which starts with the
# observed target_norm values and grows with the model's own predictions.
preds = []

groups = eval_df_s1.groupby(["country", "brand_name"])

for (country, brand), g in groups:

    g = g.sort_values("months_postgx").copy()

    # Seed with the observed values of months -5..-1 (chronological order).
    history = g[g['months_postgx'].isin(range(-5, 0))]["target_norm"].tolist()

    # Left-pad with NaN up to 5 entries. Without this a series with fewer than
    # 3 seed months raises IndexError on history[-3]; with it, missing lags are
    # NaN and the 5-month statistics stay NaN until 5 values exist, which is how
    # rolling(5) behaves on the training frame.
    history = [np.nan] * max(0, 5 - len(history)) + history

    for idx, row in g.iterrows():
        if row['months_postgx'] < 0:
            continue

        # ----- 1. Insert lag features -----
        g.loc[idx, "lag1"] = history[-1]
        g.loc[idx, "lag2"] = history[-2]
        g.loc[idx, "lag3"] = history[-3]

        # ----- 2. Compute rolling features -----
        g.loc[idx, "roll5_mean"] = np.mean(history[-5:])
        g.loc[idx, "roll5_std"]  = np.std(history[-5:])

        # ----- 3. Prepare row for prediction -----
        # Read the row back from `g` so it includes the features written above.
        X_row = g.loc[idx, features]

        # Predict
        pred = model_s1.predict(X_row.values.reshape(1, -1))[0]

        # Save prediction; it becomes lag1 of the next month.
        history.append(pred)
        g.loc[idx, "pred"] = pred

    preds.append(g)

# Combine predictions, keep only the forecast months, and attach the true
# volumes that were set aside before masking.
eval_pred_df_s1 = pd.concat(preds)
eval_pred_df_s1 = eval_pred_df_s1.sort_index()
eval_pred_df_s1 = eval_pred_df_s1[eval_pred_df_s1['months_postgx'] >= 0].drop(['volume'], axis=1)
eval_pred_df_s1 = eval_pred_df_s1.merge(eval_df_s1_true, on=["country", "brand_name", "months_postgx"])


In [None]:
# Scenario 2 evaluation: recursive forecast of months_postgx >= 6, one series at
# a time. Lag/rolling inputs are read from `history`, which starts with the
# observed target_norm values and grows with the model's own predictions.
preds = []

groups = eval_df_s2.groupby(["country", "brand_name"])

for (country, brand), g in groups:

    g = g.sort_values("months_postgx").copy()

    # Seed with the observed values of months 1..5 (chronological order).
    history = g[g['months_postgx'].isin(range(1, 6))]["target_norm"].tolist()

    # Left-pad with NaN up to 5 entries. Without this a series with fewer than
    # 3 seed months raises IndexError on history[-3]; with it, missing lags are
    # NaN and the 5-month statistics stay NaN until 5 values exist, which is how
    # rolling(5) behaves on the training frame.
    history = [np.nan] * max(0, 5 - len(history)) + history

    for idx, row in g.iterrows():
        if row['months_postgx'] < 6:
            continue

        # ----- 1. Insert lag features -----
        g.loc[idx, "lag1"] = history[-1]
        g.loc[idx, "lag2"] = history[-2]
        g.loc[idx, "lag3"] = history[-3]

        # ----- 2. Compute rolling features -----
        g.loc[idx, "roll5_mean"] = np.mean(history[-5:])
        g.loc[idx, "roll5_std"]  = np.std(history[-5:])

        # ----- 3. Prepare row for prediction -----
        # Read the row back from `g` so it includes the features written above.
        X_row = g.loc[idx, features]

        # Predict
        pred = model_s2.predict(X_row.values.reshape(1, -1))[0]

        # Save prediction; it becomes lag1 of the next month.
        history.append(pred)
        g.loc[idx, "pred"] = pred

    preds.append(g)

# Combine predictions, keep only the forecast months, and attach the true
# volumes that were set aside before masking.
eval_pred_df_s2 = pd.concat(preds)
eval_pred_df_s2 = eval_pred_df_s2.sort_index()
eval_pred_df_s2 = eval_pred_df_s2[eval_pred_df_s2['months_postgx'] >= 6].drop(['volume'], axis=1)
eval_pred_df_s2 = eval_pred_df_s2.merge(eval_df_s2_true, on=["country", "brand_name", "months_postgx"])


In [None]:
def metric_s1(df: pd.DataFrame):
    """
    Scenario 1 prediction error, averaged over (country, brand_name) series.

    df must contain columns:
    - country, brand_name  (series identifier used for grouping)
    - months_postgx
    - volume               (actual volume)
    - pred                 (prediction in Avgj-normalised units; rescaled by Avgj here)
    - Avgj                 (per-series baseline; the value of the first row is used)

    Per series the score is
        0.2 * monthly absolute error over months 0-23
      + 0.5 * accumulated error over months 0-5
      + 0.2 * accumulated error over months 6-11
      + 0.1 * accumulated error over months 12-23
    where each term is divided by (nominal number of months in the window * Avgj).

    Returns the unweighted mean of the per-series scores.
    """
    # (first month, last month, nominal window length, weight) of each
    # accumulated-error term, in the order they are added to the score.
    accumulated_terms = (
        (0, 5, 6, 0.5),
        (6, 11, 6, 0.2),
        (12, 23, 12, 0.1),
    )

    series_scores = []

    for _, series in df.groupby(["country", "brand_name"]):
        baseline = series["Avgj"].iloc[0]
        actual = series["volume"].values
        forecast = series["pred"].values * baseline
        month = series["months_postgx"].values

        # Month-by-month absolute error over the whole 0-23 horizon.
        horizon = (month >= 0) & (month <= 23)
        score = 0.2 * (np.abs(actual[horizon] - forecast[horizon]).sum() / (24 * baseline))

        # Absolute error of the window totals (over- and under-shoots cancel out).
        for first, last, n_months, weight in accumulated_terms:
            window = (month >= first) & (month <= last)
            gap = np.abs(actual[window].sum() - forecast[window].sum())
            score = score + weight * (gap / (n_months * baseline))

        series_scores.append(score)

    return np.mean(series_scores)


def metric_s2(df: pd.DataFrame):
    """
    Scenario 2 prediction error, averaged over (country, brand_name) series.

    df must contain columns:
    - country, brand_name  (series identifier used for grouping)
    - months_postgx
    - volume               (actual volume)
    - pred                 (prediction in Avgj-normalised units; rescaled by Avgj here)
    - Avgj                 (per-series baseline; the value of the first row is used)

    Per series the score is
        0.2 * monthly absolute error over months 6-23
      + 0.5 * accumulated error over months 6-11
      + 0.3 * accumulated error over months 12-23
    where each term is divided by (nominal number of months in the window * Avgj).

    Returns the unweighted mean of the per-series scores.
    """
    # (first month, last month, nominal window length, weight) of each
    # accumulated-error term, in the order they are added to the score.
    accumulated_terms = (
        (6, 11, 6, 0.5),
        (12, 23, 12, 0.3),
    )

    series_scores = []

    for _, series in df.groupby(["country", "brand_name"]):
        baseline = series["Avgj"].iloc[0]
        actual = series["volume"].values
        forecast = series["pred"].values * baseline
        month = series["months_postgx"].values

        # Month-by-month absolute error over the whole 6-23 horizon.
        horizon = (month >= 6) & (month <= 23)
        score = 0.2 * (np.abs(actual[horizon] - forecast[horizon]).sum() / (18 * baseline))

        # Absolute error of the window totals (over- and under-shoots cancel out).
        for first, last, n_months, weight in accumulated_terms:
            window = (month >= first) & (month <= last)
            gap = np.abs(actual[window].sum() - forecast[window].sum())
            score = score + weight * (gap / (n_months * baseline))

        series_scores.append(score)

    return np.mean(series_scores)


In [None]:
# Held-out scores of both scenarios (mean per-series error).
score_s1 = metric_s1(eval_pred_df_s1)
print("Scenario 1 metric:", score_s1)

score_s2 = metric_s2(eval_pred_df_s2)
print("Scenario 2 metric:", score_s2)

In [None]:
# Load the three test CSVs (volume, generics, medicine info), in that order.
t_vol, t_gxs, t_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/test/df_volume_test.csv',
        'data/test/df_generics_test.csv',
        'data/test/df_medicine_info_test.csv',
    )
)

In [None]:
# Preview the first rows of the test volume table.
t_vol.head()

In [None]:
# Extend t_vol so that every (country, brand_name) series runs up to
# months_postgx = 23: the appended rows have volume = NaN and a calendar month
# that continues from the last existing row of the series.
month_to_int = {
    abbr: number
    for number, abbr in enumerate(
        ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
        start=1,
    )
}

int_to_month = dict(zip(month_to_int.values(), month_to_int.keys()))

extended_rows = []

for (country, brand), g in t_vol.groupby(["country", "brand_name"]):

    g = g.sort_values("months_postgx")

    # Last existing row of the series, i.e. the one with the largest months_postgx.
    last_row = g.iloc[-1]
    last_months_postgx = last_row["months_postgx"]
    last_month_int = month_to_int[last_row["month"]]

    # One placeholder row per missing month, up to and including months_postgx = 23.
    for offset, horizon in enumerate(range(last_months_postgx + 1, 24)):

        # offset 0 is the month right after the last existing one;
        # "% 12 ... + 1" wraps Dec -> Jan while staying in 1..12.
        next_month_int = ((last_month_int + offset) % 12) + 1

        extended_rows.append({
            "country": country,
            "brand_name": brand,
            "month": int_to_month[next_month_int],
            "months_postgx": horizon,
            "volume": np.nan
        })

# Placeholder rows as a frame ...
df_future = pd.DataFrame(extended_rows)

# ... appended to the original rows and put back in series/chronological order.
t_vol = (
    pd.concat([t_vol, df_future], ignore_index=True)
    .sort_values(["country", "brand_name", "months_postgx"])
    .reset_index(drop=True)
)


In [None]:
# Test modelling frame, built with the same steps as the training frame:
# joins, Avgj, target_norm, pre-entry statistics, 3-month window statistics
# and the cyclical encoding of months_postgx.

# --- Joins ---
t_df = (
    t_vol
    .merge(
        t_gxs,
        how='left',
        on=['country', 'brand_name', 'months_postgx'],
        validate='one_to_one',
    )
    .merge(
        t_info,
        how='left',
        on=['country', 'brand_name'],
        validate='many_to_one',
    )
)

# --- Avgj: mean volume over months_postgx -12..-1, per series ---
avgj = (
    t_df.loc[t_df["months_postgx"].between(-12, -1)]
    .groupby(["country", "brand_name"])["volume"]
    .mean()
    .rename("Avgj")
)
t_df = t_df.merge(
    avgj,
    how="left",
    on=["country", "brand_name"],
    validate="many_to_one",
)

# --- Normalised target ---
t_df["target_norm"] = t_df["volume"].div(t_df["Avgj"])

# --- Statistics over all rows with months_postgx < 0 ---
pre = t_df.loc[t_df["months_postgx"] < 0]
pre_stats = (
    pre
    .groupby(["country", "brand_name"])["target_norm"]
    .agg(
        **{
            "pre_mean": "mean",
            "pre_std": "std",
            "pre_min": "min",
            "pre_max": "max",
            # Slope of a degree-1 fit against row position; 0 for fewer than two points.
            "pre_trend": lambda x: 0 if len(x) <= 1 else np.polyfit(range(len(x)), x, 1)[0],
        }
    )
)
t_df = t_df.merge(
    pre_stats,
    how="left",
    on=["country", "brand_name"],
    validate="many_to_one",
)

# --- Statistics over eight 3-month windows covering months_postgx -24..-1 ---
windows = {
    "t1": (-24, -22), "t2": (-21, -19), "t3": (-18, -16), "t4": (-15, -13),
    "t5": (-12, -10), "t6": (-9, -7), "t7": (-6, -4), "t8": (-3, -1),
}

trimester_frames = []

for name, (start, end) in windows.items():
    in_window = t_df["months_postgx"].between(start, end)
    window_target = t_df.loc[in_window].groupby(["country", "brand_name"])["target_norm"]

    # Output column name -> aggregation, prefixed with the window label.
    window_aggs = {
        f"{name}_mean": "mean",
        f"{name}_std": "std",
        f"{name}_min": "min",
        f"{name}_max": "max",
        f"{name}_trend": lambda x: 0 if len(x) <= 1 else np.polyfit(range(len(x)), x, 1)[0],
    }
    trimester_frames.append(window_target.agg(**window_aggs))

# One wide table (a row per series, a column per window statistic) ...
trimester_stats = pd.concat(trimester_frames, axis=1)

# ... broadcast back onto every row of the series.
t_df = t_df.merge(trimester_stats, how="left", on=["country", "brand_name"])

# --- Sine/cosine encoding of months_postgx with a 12-month period ---
t_months_postgx_angle = t_df['months_postgx'] * (2.*np.pi/12)
t_df['months_postgx_sin'] = np.sin(t_months_postgx_angle)
t_df['months_postgx_cos'] = np.cos(t_months_postgx_angle)

In [None]:
# Assign each test series to a scenario from the last month with an observed
# volume: -1 -> scenario 1, 5 -> scenario 2.
max_real = (
    t_df.dropna(subset=["volume"])
    .groupby(["country", "brand_name"])["months_postgx"]
    .max()
    .rename("max_real_month")
)
t_df_with_max = t_df.merge(max_real, on=["country", "brand_name"])

last_observed_month = t_df_with_max["max_real_month"]
t_df_s1 = (
    t_df_with_max.loc[last_observed_month == -1]
    .copy()
    .drop(columns="max_real_month")
)
t_df_s2 = (
    t_df_with_max.loc[last_observed_month == 5]
    .copy()
    .drop(columns="max_real_month")
)

In [None]:
# Scenario 1 test forecast: recursive prediction of months_postgx >= 0, one
# series at a time. Lag/rolling inputs are read from `history`, which starts
# with the observed target_norm values and grows with the model's predictions.
preds = []

groups = t_df_s1.groupby(["country", "brand_name"])

for (country, brand), g in groups:

    g = g.sort_values("months_postgx").copy()

    # Seed with the observed values of months -5..-1 (chronological order).
    history = g[g['months_postgx'].isin(range(-5, 0))]["target_norm"].tolist()

    # Left-pad with NaN up to 5 entries. Without this a series with fewer than
    # 3 seed months raises IndexError on history[-3]; with it, missing lags are
    # NaN and the 5-month statistics stay NaN until 5 values exist, which is how
    # rolling(5) behaves on the training frame.
    history = [np.nan] * max(0, 5 - len(history)) + history

    for idx, row in g.iterrows():
        if row['months_postgx'] < 0:
            continue

        # ----- 1. Insert lag features -----
        g.loc[idx, "lag1"] = history[-1]
        g.loc[idx, "lag2"] = history[-2]
        g.loc[idx, "lag3"] = history[-3]

        # ----- 2. Compute rolling features -----
        g.loc[idx, "roll5_mean"] = np.mean(history[-5:])
        g.loc[idx, "roll5_std"]  = np.std(history[-5:])

        # ----- 3. Prepare row for prediction -----
        # Read the row back from `g` so it includes the features written above.
        X_row = g.loc[idx, features]

        # Predict
        pred = model_s1.predict(X_row.values.reshape(1, -1))[0]

        # Save prediction; it becomes lag1 of the next month.
        history.append(pred)
        # NOTE(review): `pred` is on the target_norm scale (volume / Avgj), so the
        # forecast rows of `volume` hold normalised values while the observed rows
        # hold absolute volumes — confirm a later step multiplies by Avgj.
        g.loc[idx, "volume"] = pred

    preds.append(g)

# Combine predictions
t_pred_df_s1 = pd.concat(preds)
t_pred_df_s1 = t_pred_df_s1.sort_index()
