In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score

from src.preprocess import general_preprocessing, train_preprocessing, build_bucket_dataset
from src.train import split_train_test, split_true, split_scenario, split_k_fold, split_bucket_train_test, split_buckets
from src.model import fit_model, predict, fit_model_cv
from src.bucket_classifier import fit_bucket_classifier, predict_bucket1_proba, predict_probas
from src.metrics import compute_metric1, compute_metric2
from src.plot import plot_interactive_comparison

In [3]:
# Raw training tables, in the positional order general_preprocessing receives them:
# volume, generics, medicine info.
train_csv_paths = (
    'data/train/df_volume_train.csv',
    'data/train/df_generics_train.csv',
    'data/train/df_medicine_info_train.csv',
)

# df and df_aux are the two frames every later cell works from.
df, df_aux = general_preprocessing(*(pd.read_csv(csv_path) for csv_path in train_csv_paths))

## Classifier Cross Validation

In [None]:
def cross_validate_bucket_classifier(bucket_df, scenario, label, threshold=0.5):
    """Cross-validate the bucket classifier for one scenario and print its metrics.

    For every fold yielded by ``split_k_fold(bucket_df)`` a classifier is fitted on
    the training part and scored on the evaluation part with ROC AUC (on the
    predicted probabilities) and F1 (on the probabilities cut at ``threshold``).

    Args:
        bucket_df: Bucket dataset as returned by ``build_bucket_dataset``; the
            evaluation folds must expose a ``"label"`` column.
        scenario: Scenario key forwarded to the classifier helpers ("s1" or "s2").
        label: Human-readable scenario name used in the printed lines.
        threshold: Probability above which a sample is predicted as positive.

    Returns:
        Tuple ``(roc_auc_scores, f1_scores)`` with one entry per fold.
    """
    roc_auc_scores = []
    f1_scores = []
    # The third item yielded by split_k_fold is not needed for the classifier.
    for train_bucket_df, eval_bucket_df, _ in split_k_fold(bucket_df):
        bucket_model = fit_bucket_classifier(train_bucket_df, scenario, verbose=False)
        proba_bucket = predict_bucket1_proba(bucket_model, eval_bucket_df, scenario)
        pred_bucket = (proba_bucket > threshold).astype(int)

        y_true = eval_bucket_df["label"].values
        fold_roc_auc = roc_auc_score(y_true, proba_bucket)
        fold_f1 = f1_score(y_true, pred_bucket)
        roc_auc_scores.append(fold_roc_auc)
        f1_scores.append(fold_f1)

        print(f"{label} metrics:", fold_roc_auc, fold_f1)
    print(f"Mean {label} metrics:", np.mean(roc_auc_scores), np.mean(f1_scores))
    return roc_auc_scores, f1_scores


# Both bucket datasets are built up front, before any classifier is fitted.
df_bucket_s1 = build_bucket_dataset(df, df_aux, "s1")
df_bucket_s2 = build_bucket_dataset(df, df_aux, "s2")

s1_roc_auc_scores, s1_f1_scores = cross_validate_bucket_classifier(df_bucket_s1, "s1", "Scenario 1")
s2_roc_auc_scores, s2_f1_scores = cross_validate_bucket_classifier(df_bucket_s2, "s2", "Scenario 2")

## Regressor Cross Validation

In [None]:
# k-fold cross validation of the plain regressors (one model per scenario,
# no bucket classifier involved). One metric value per fold is collected.
metrics_1, metrics_2 = [], []
for fold_train_df, fold_eval_s1, fold_eval_s2 in split_k_fold(df):
    # Separate each evaluation frame into model inputs and held-back true values.
    # NOTE(review): the 0 / 6 arguments presumably mark the first month to be
    # predicted in scenario 1 / scenario 2 - confirm against src.train.split_true.
    fold_eval_s1, fold_true_s1 = split_true(fold_eval_s1, 0)
    fold_eval_s2, fold_true_s2 = split_true(fold_eval_s2, 6)

    fold_train_df = train_preprocessing(fold_train_df, df_aux)

    # Both models are fitted before any prediction is made.
    fold_model_s1 = fit_model(fold_train_df, 's1', verbose=False, predict_avg=True)
    fold_model_s2 = fit_model(fold_train_df, 's2', verbose=False, predict_avg=True)

    fold_pred_s1 = predict(fold_model_s1, fold_eval_s1, 0)
    fold_pred_s2 = predict(fold_model_s2, fold_eval_s2, 6)

    fold_metric1 = compute_metric1(fold_true_s1, fold_pred_s1, df_aux)
    fold_metric2 = compute_metric2(fold_true_s2, fold_pred_s2, df_aux)
    metrics_1.append(fold_metric1)
    metrics_2.append(fold_metric2)

    print('--------------------')
    print("Scenario 1 metric:", fold_metric1)
    print("Scenario 2 metric:", fold_metric2)

print('---------------------------------')
print("Mean Scenario 1 metric:", np.mean(metrics_1))
print("Mean Scenario 2 metric:", np.mean(metrics_2))

## Classifier + Regressor Cross Validation

In [None]:
# k-fold cross validation of the combined pipeline: the bucket classifiers
# produce votes, and predict() receives the two per-bucket regressors together
# with those votes.
metrics_1, metrics_2 = [], []
for fold_train_df, fold_eval_s1, fold_eval_s2 in split_k_fold(df):
    # Keep the true values apart from the frames handed to the models
    fold_eval_s1, fold_true_s1 = split_true(fold_eval_s1, 0)
    fold_eval_s2, fold_true_s2 = split_true(fold_eval_s2, 6)

    # Bucket classifiers: built from the fold's training data *before* it goes
    # through train_preprocessing
    fold_bucket_s1_df = build_bucket_dataset(fold_train_df, df_aux, "s1")
    fold_bucket_s2_df = build_bucket_dataset(fold_train_df, df_aux, "s2")
    fold_cls_s1 = fit_bucket_classifier(fold_bucket_s1_df, "s1", verbose=False)
    fold_cls_s2 = fit_bucket_classifier(fold_bucket_s2_df, "s2", verbose=False)
    fold_votes_s1 = predict_probas(fold_cls_s1, fold_eval_s1, "s1")
    fold_votes_s2 = predict_probas(fold_cls_s2, fold_eval_s2, "s2")

    # Preprocess the training data and split it by bucket
    fold_train_df = train_preprocessing(fold_train_df, df_aux)
    fold_b1_df, fold_b2_df = split_buckets(fold_train_df, df_aux)

    # One regressor per (bucket, scenario); dict literals evaluate in order,
    # so the fits run as b1/s1, b2/s1, b1/s2, b2/s2
    fold_regressors = {
        ("b1", "s1"): fit_model(fold_b1_df, 's1', predict_avg=True, verbose=False),
        ("b2", "s1"): fit_model(fold_b2_df, 's1', predict_avg=True, verbose=False),
        ("b1", "s2"): fit_model(fold_b1_df, 's2', predict_avg=True, verbose=False),
        ("b2", "s2"): fit_model(fold_b2_df, 's2', predict_avg=True, verbose=False),
    }

    fold_pred_s1 = predict(
        [fold_regressors[("b1", "s1")], fold_regressors[("b2", "s1")]],
        fold_eval_s1, 0, fold_votes_s1,
    )
    fold_pred_s2 = predict(
        [fold_regressors[("b1", "s2")], fold_regressors[("b2", "s2")]],
        fold_eval_s2, 6, fold_votes_s2,
    )

    fold_metric1 = compute_metric1(fold_true_s1, fold_pred_s1, df_aux)
    fold_metric2 = compute_metric2(fold_true_s2, fold_pred_s2, df_aux)
    metrics_1.append(fold_metric1)
    metrics_2.append(fold_metric2)

    print('--------------------')
    print("Scenario 1 metric:", fold_metric1)
    print("Scenario 2 metric:", fold_metric2)

print('---------------------------------')
print("Mean Scenario 1 metric:", np.mean(metrics_1))
print("Mean Scenario 2 metric:", np.mean(metrics_2))

In [None]:
# NOTE(review): disabled scratch cell. It evaluates the bucket classifiers on the
# split returned by split_bucket_train_test instead of the folds from split_k_fold.
# If re-enabled it needs df_bucket_s1 / df_bucket_s2, which are only created in the
# "Classifier Cross Validation" cell, so that cell has to run first.

# train_bucket_s1_df, eval_bucket_s1_df = split_bucket_train_test(df_bucket_s1)
# bucket_model_s1 = fit_bucket_classifier(train_bucket_s1_df, "s1", verbose=False)
# proba_bucket_s1 = predict_bucket1_proba(bucket_model_s1, eval_bucket_s1_df, "s1")
# pred_bucket_s1 = (proba_bucket_s1 > 0.5).astype(int)
# print("Scenario 1 metrics:", roc_auc_score(eval_bucket_s1_df["label"].values, proba_bucket_s1), f1_score(eval_bucket_s1_df["label"].values, pred_bucket_s1))

# train_bucket_s2_df, eval_bucket_s2_df = split_bucket_train_test(df_bucket_s2)
# bucket_model_s2 = fit_bucket_classifier(train_bucket_s2_df, "s2", verbose=False)
# proba_bucket_s2 = predict_bucket1_proba(bucket_model_s2, eval_bucket_s2_df, "s2")
# pred_bucket_s2 = (proba_bucket_s2 > 0.5).astype(int)
# print("Scenario 2 metrics:", roc_auc_score(eval_bucket_s2_df["label"].values, proba_bucket_s2), f1_score(eval_bucket_s2_df["label"].values, pred_bucket_s2))

In [None]:
# NOTE(review): disabled diagnostics cell (feature importance, F1 threshold sweep,
# probability plots). If re-enabled it reads bucket_model_s1, proba_bucket_s1,
# proba_bucket_s2, eval_bucket_s1_df and eval_bucket_s2_df from the kernel; no live
# cell in this notebook defines the proba_* arrays, so another cell has to produce
# them first. The matplotlib import below would belong in the import cell.

# imp = bucket_model_s1.get_feature_importance()
# fi = pd.DataFrame({"feature": bucket_model_s1.feature_names_, "importance": imp}).sort_values("importance", ascending=False)
# print(fi.head(20))

# Threshold sweep: keeps the threshold with the highest F1.
# best_t is only assigned inside the loop.
# ths = np.linspace(0.001, 0.1)
# best = None
# for t in ths:
#     pred_bucket_s1 = (proba_bucket_s1 > t).astype(int)
#     f1 = f1_score(eval_bucket_s1_df["label"].values, pred_bucket_s1)
#     if best is None or f1 > best:
#         best = f1
#         best_t = t
# print("Best threshold:", best_t)
# print("Best f1:", best)



# import matplotlib.pyplot as plt

# def plot_probs_vs_buckets(base_df: pd.DataFrame, p_oof: np.ndarray, label_col="label", title=""):
#     dfp = base_df.copy().reset_index(drop=True)
#     dfp["p_high"] = p_oof
#     dfp["bucket_true"] = np.where(dfp[label_col] == 1, "bucket1 (high erosion)", "bucket2 (low erosion)")

#     # ---------- 1) Histogram by bucket ----------
#     plt.figure(figsize=(8,5))
#     for b, sub in dfp.groupby("bucket_true"):
#         plt.hist(sub["p_high"], bins=30, alpha=0.5, density=True, label=b)
#     plt.xlabel("Predicted P(high erosion)")
#     plt.ylabel("Density")
#     plt.title(f"Probability distributions by bucket {title}")
#     plt.legend()
#     plt.show()

#     # ---------- 2) Boxplot by bucket ----------
#     plt.figure(figsize=(6,5))
#     data = [dfp.loc[dfp["bucket_true"].str.contains("bucket1"), "p_high"],
#             dfp.loc[dfp["bucket_true"].str.contains("bucket2"), "p_high"]]
#     plt.boxplot(data, tick_labels=["bucket1", "bucket2"])
#     plt.ylabel("Predicted P(high erosion)")
#     plt.title(f"Boxplot of probabilities by bucket {title}")
#     plt.show()

#     # ---------- 3) Bin plot: avg prob vs true rate ----------
#     # (like a calibration curve, but also good for threshold intuition)
#     # NOTE(review): the aggregation below reads the literal "label" column rather
#     # than label_col, so a non-default label_col would only affect bucket_true.
#     dfp["bin"] = pd.qcut(dfp["p_high"], q=10, duplicates="drop")
#     bin_stats = dfp.groupby("bin", observed=True).agg(
#         avg_p=("p_high", "mean"),
#         true_rate=("label", "mean"),
#         n=("label", "size")
#     ).reset_index()

#     plt.figure(figsize=(7,5))
#     plt.plot(bin_stats["avg_p"], bin_stats["true_rate"], marker="o")
#     plt.plot([0,1], [0,1], linestyle="--")  # perfect calibration line
#     plt.xlabel("Average predicted P(high erosion) in bin")
#     plt.ylabel("True bucket1 rate in bin")
#     plt.title(f"Calibration / separation by probability bins {title}")
#     for _, r in bin_stats.iterrows():
#         plt.text(r["avg_p"], r["true_rate"], str(int(r["n"])), fontsize=9, ha="center", va="bottom")
#     plt.show()

#     return dfp, bin_stats

# dfp_s1, bins_s1 = plot_probs_vs_buckets(eval_bucket_s1_df, proba_bucket_s1, title="(Scenario 1)")
# dfp_s2, bins_s2 = plot_probs_vs_buckets(eval_bucket_s2_df, proba_bucket_s2, title="(Scenario 2)")

### Normalized (classifier CV; values per fold: ROC AUC, F1)
Scenario 1 metrics: 0.6456119637937819 0.06896551724137931 \
Scenario 1 metrics: 0.648005148005148 0.08 \
Scenario 1 metrics: 0.7286324786324786 0.07142857142857142 \
Scenario 1 metrics: 0.6609116022099447 0.0 \
Scenario 1 metrics: 0.6502535925612849 0.12903225806451613 \
Mean Scenario 1 metrics: 0.6666829570405277 0.06988526934689336 \
Scenario 2 metrics: 0.9813065722156631 0.7457627118644068 \
Scenario 2 metrics: 0.9667953667953668 0.6382978723404256 \
Scenario 2 metrics: 0.9803622303622304 0.7241379310344828 \
Scenario 2 metrics: 0.9457379636937648 0.7213114754098361 \
Scenario 2 metrics: 0.9573119188503804 0.7037037037037037 \
Mean Scenario 2 metrics: 0.9663028103834812 0.7066427388705709


### Non-normalized (classifier CV; values per fold: ROC AUC, F1)
Scenario 1 metrics: 0.5609996064541518 0.06666666666666667 \
Scenario 1 metrics: 0.6428571428571428 0.14814814814814814 \
Scenario 1 metrics: 0.6960724460724461 0.058823529411764705 \
Scenario 1 metrics: 0.7054064719810577 0.06666666666666667 \
Scenario 1 metrics: 0.6201394759087068 0.0 \
Mean Scenario 1 metrics: 0.645095028654701 0.06806100217864923 \
Scenario 2 metrics: 0.9851436442345534 0.7272727272727273 \
Scenario 2 metrics: 0.9675675675675676 0.5909090909090909 \
Scenario 2 metrics: 0.9757834757834758 0.6666666666666666 \
Scenario 2 metrics: 0.9517561168113654 0.7272727272727273 \
Scenario 2 metrics: 0.9587912087912087 0.6666666666666666 \
Mean Scenario 2 metrics: 0.9678084026376343 0.6757575757575757

In [None]:
# NOTE(review): disabled scratch cell. Same steps as the "Regressor Cross Validation"
# cell, but on the single split returned by split_train_test instead of the folds
# from split_k_fold, and with fit_model left at its default verbosity.

# train_df, eval_df_s1, eval_df_s2 = split_train_test(df)

# eval_df_s1, eval_df_s1_true = split_true(eval_df_s1, 0)
# eval_df_s2, eval_df_s2_true = split_true(eval_df_s2, 6)

# train_df = train_preprocessing(train_df, df_aux)

# model_s1 = fit_model(train_df, 's1', predict_avg=True)
# model_s2 = fit_model(train_df, 's2', predict_avg=True)

# eval_pred_df_s1 = predict(model_s1, eval_df_s1, 0)
# eval_pred_df_s2 = predict(model_s2, eval_df_s2, 6)

# print("Scenario 1 metric:", compute_metric1(eval_df_s1_true, eval_pred_df_s1, df_aux))
# print("Scenario 2 metric:", compute_metric2(eval_df_s2_true, eval_pred_df_s2, df_aux))

In [None]:
# # Regressor per bucket
# NOTE(review): disabled and likely stale. This cell unpacks SIX values from
# split_buckets(df, df_aux), while the live cells in this notebook unpack TWO
# (train_b1_df, train_b2_df) from split_buckets applied to an already
# train-preprocessed frame. Check the current signature in src.train before
# re-enabling.

# train_b1_df, eval_b1_df_s1, eval_b1_df_s2, train_b2_df, eval_b2_df_s1, eval_b2_df_s2 = split_buckets(df, df_aux)

# eval_b1_df_s1, eval_b1_df_s1_true = split_true(eval_b1_df_s1, 0)
# eval_b1_df_s2, eval_b1_df_s2_true = split_true(eval_b1_df_s2, 6)

# eval_b2_df_s1, eval_b2_df_s1_true = split_true(eval_b2_df_s1, 0)
# eval_b2_df_s2, eval_b2_df_s2_true = split_true(eval_b2_df_s2, 6)

# train_b1_df = train_preprocessing(train_b1_df, df_aux)
# train_b2_df = train_preprocessing(train_b2_df, df_aux)

# model_b1_s1 = fit_model(train_b1_df, 's1', predict_avg=True)
# model_b1_s2 = fit_model(train_b1_df, 's2', predict_avg=True)
# model_b2_s1 = fit_model(train_b2_df, 's1', predict_avg=True)
# model_b2_s2 = fit_model(train_b2_df, 's2', predict_avg=True)

# eval_pred_b1_df_s1 = predict(model_b1_s1, eval_b1_df_s1, 0)
# eval_pred_b1_df_s2 = predict(model_b1_s2, eval_b1_df_s2, 6)
# eval_pred_b2_df_s1 = predict(model_b2_s1, eval_b2_df_s1, 0)
# eval_pred_b2_df_s2 = predict(model_b2_s2, eval_b2_df_s2, 6)

# print("Bucket 1 Scenario 1 metric:", compute_metric1(eval_b1_df_s1_true, eval_pred_b1_df_s1, df_aux))
# print("Bucket 1 Scenario 2 metric:", compute_metric2(eval_b1_df_s2_true, eval_pred_b1_df_s2, df_aux))
# print("Bucket 2 Scenario 1 metric:", compute_metric1(eval_b2_df_s1_true, eval_pred_b2_df_s1, df_aux))
# print("Bucket 2 Scenario 2 metric:", compute_metric2(eval_b2_df_s2_true, eval_pred_b2_df_s2, df_aux))

## Fit on the whole dataset

In [None]:
# Fit the per-scenario regressors on the complete training set (no hold-out).
# df is copied first so the shared frame is not handed to train_preprocessing directly.
all_df = df.copy()

all_df = train_preprocessing(all_df, df_aux)

model_s1 = fit_model(all_df, 's1', predict_avg=True)
model_s2 = fit_model(all_df, 's2', predict_avg=True)

# Save model
# The run id is one above the highest numeric suffix of any "*.model" file in
# models/ (or 0 when there is none), so the id is shared across all model kinds.
os.makedirs("models", exist_ok=True)  # a fresh checkout may not have the directory yet
existing_ids = []
for file_name in os.listdir("models"):
    if not file_name.endswith(".model"):
        continue
    try:
        existing_ids.append(int(file_name.removesuffix(".model").split("_")[-1]))
    except ValueError:
        # A model file without a numeric "_<id>" suffix (e.g. a hand-named copy)
        # must not abort saving; it simply does not take part in the numbering.
        continue
latest_id = max(existing_ids, default=-1)

model_s1_path = f"models/cb_s1_{latest_id + 1}.model"
model_s2_path = f"models/cb_s2_{latest_id + 1}.model"
model_s1.save_model(model_s1_path)
model_s2.save_model(model_s2_path)
print(f"Models saved to {model_s1_path} and {model_s2_path}")

In [None]:
# Create submission (regressors only, using model_s1 / model_s2 from the kernel)
test_csv_paths = (
    'data/test/df_volume_test.csv',
    'data/test/df_generics_test.csv',
    'data/test/df_medicine_info_test.csv',
)
# With is_test=True the second returned frame (t_sub) is what the predictions
# are merged onto below.
t_df, t_sub = general_preprocessing(
    *(pd.read_csv(csv_path) for csv_path in test_csv_paths),
    is_test=True
)

t_df_s1, t_df_s2 = split_scenario(t_df)
scenario_predictions = [
    predict(model_s1, t_df_s1, 0),
    predict(model_s2, t_df_s2, 6),
]

# Stack both scenarios and keep only the key columns plus the predicted volume
submission_columns = ['country', 'brand_name', 'months_postgx', 'volume']
t_pred = pd.concat(scenario_predictions)[submission_columns]

# validate="one_to_one" makes pandas raise if a key is duplicated on either side
merge_keys = ["country", "brand_name", "months_postgx"]
t_final = t_sub.merge(t_pred, on=merge_keys, how="left", validate="one_to_one")

## Fit Reg + Cls on the whole dataset

In [4]:
# Fit the bucket classifiers and the per-bucket regressors on the complete
# training set (no hold-out), then persist all six models under one run id.
all_df = df.copy()

# Train the classifier
# (built from all_df before train_preprocessing is applied to it)
all_bucket_s1_df = build_bucket_dataset(all_df, df_aux, "s1")
all_bucket_s2_df = build_bucket_dataset(all_df, df_aux, "s2")
bucket_model_s1 = fit_bucket_classifier(all_bucket_s1_df, "s1", verbose=False)
bucket_model_s2 = fit_bucket_classifier(all_bucket_s2_df, "s2", verbose=False)

# Prepare train set according to buckets
all_df = train_preprocessing(all_df, df_aux)
all_b1_df, all_b2_df = split_buckets(all_df, df_aux)

# Train regressors
model_b1_s1 = fit_model(all_b1_df, 's1', predict_avg=True, verbose=False)
model_b2_s1 = fit_model(all_b2_df, 's1', predict_avg=True, verbose=False)
model_b1_s2 = fit_model(all_b1_df, 's2', predict_avg=True, verbose=False)
model_b2_s2 = fit_model(all_b2_df, 's2', predict_avg=True, verbose=False)

# Save model
# The run id is one above the highest numeric suffix of any "*.model" file in
# models/ (or 0 when there is none).
os.makedirs("models", exist_ok=True)  # a fresh checkout may not have the directory yet
existing_ids = []
for file_name in os.listdir("models"):
    if not file_name.endswith(".model"):
        continue
    try:
        existing_ids.append(int(file_name.removesuffix(".model").split("_")[-1]))
    except ValueError:
        # A model file without a numeric "_<id>" suffix must not abort saving;
        # it simply does not take part in the numbering.
        continue
latest_id = max(existing_ids, default=-1)

# Single source of truth for what gets saved and what gets reported, so the
# save calls and the printed paths cannot drift apart.
models_to_save = {
    "cb_b1_s1": model_b1_s1,
    "cb_b2_s1": model_b2_s1,
    "cb_b1_s2": model_b1_s2,
    "cb_b2_s2": model_b2_s2,
    "cb_bucket_s1": bucket_model_s1,
    "cb_bucket_s2": bucket_model_s2,
}
saved_paths = []
for model_name, fitted_model in models_to_save.items():
    model_path = f"models/{model_name}_{latest_id + 1}.model"
    fitted_model.save_model(model_path)
    saved_paths.append(model_path)
print("Models saved to:", *saved_paths, sep="\n")

Models saved to:
models/cb_b1_s1_6.model
models/cb_b2_s1_6.model
models/cb_b1_s2_6.model
models/cb_b2_s2_6.model
models/cb_bucket_s1_6.model
models/cb_bucket_s2_6.model


In [5]:
# Create submission (bucket classifier votes + per-bucket regressors, all taken
# from the kernel as fitted by the "Fit Reg + Cls on the whole dataset" cell)
test_csv_paths = (
    'data/test/df_volume_test.csv',
    'data/test/df_generics_test.csv',
    'data/test/df_medicine_info_test.csv',
)
t_df, t_sub = general_preprocessing(
    *(pd.read_csv(csv_path) for csv_path in test_csv_paths),
    is_test=True
)

t_df_s1, t_df_s2 = split_scenario(t_df)

# Classifier votes for both scenarios are computed before any regressor runs
votes_s1 = predict_probas(bucket_model_s1, t_df_s1, "s1")
votes_s2 = predict_probas(bucket_model_s2, t_df_s2, "s2")

regressors_s1 = [model_b1_s1, model_b2_s1]
regressors_s2 = [model_b1_s2, model_b2_s2]
t_pred_s1_df = predict(regressors_s1, t_df_s1, 0, votes_s1)
t_pred_s2_df = predict(regressors_s2, t_df_s2, 6, votes_s2)

# Stack both scenarios and keep only the key columns plus the predicted volume
submission_columns = ['country', 'brand_name', 'months_postgx', 'volume']
t_pred = pd.concat([t_pred_s1_df, t_pred_s2_df])[submission_columns]

# validate="one_to_one" makes pandas raise if a key is duplicated on either side
merge_keys = ["country", "brand_name", "months_postgx"]
t_final = t_sub.merge(t_pred, on=merge_keys, how="left", validate="one_to_one")

## Save Submission

In [6]:
# Write t_final (from whichever submission cell ran last) to the next free
# submissions/submission<N>.csv, where N is one above the highest existing id.
os.makedirs("submissions", exist_ok=True)  # a fresh checkout may not have the directory yet
existing_ids = []
for file_name in os.listdir("submissions/"):
    if not file_name.startswith("submission"):
        continue
    try:
        existing_ids.append(int(file_name.removeprefix("submission").removesuffix(".csv")))
    except ValueError:
        # Files such as "submission_final.csv" carry no numeric id; skip them
        # instead of aborting the save.
        continue
latest_id = max(existing_ids, default=-1)

submission_path = f"submissions/submission{latest_id + 1}.csv"
t_final.to_csv(submission_path, index=False)
print(f"Saved to {submission_path}")

Saved to submissions/submission15.csv
