In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
# (Keep comments on their own lines: trailing text would be parsed as a magic argument.)
%load_ext autoreload
%autoreload 2

In [None]:
import os

import pandas as pd
import numpy as np

from src.preprocess import general_preprocessing, add_lags
from src.train import split_train_test, split_true, split_scenario, split_k_fold
from src.model import fit_model, predict
from metric_calculation import compute_metric1, compute_metric2

In [None]:
# Read the three raw training tables, in the positional order expected by
# general_preprocessing (volume, generics, medicine info).
volume_train_raw = pd.read_csv('data/train/df_volume_train.csv')
generics_train_raw = pd.read_csv('data/train/df_generics_train.csv')
medicine_info_train_raw = pd.read_csv('data/train/df_medicine_info_train.csv')

# `df` feeds every split/fit cell below; `df_aux` is passed to the metric functions.
df, df_aux = general_preprocessing(
    volume_train_raw,
    generics_train_raw,
    medicine_info_train_raw,
)

## Direct

In [None]:
# Single hold-out evaluation: one training frame plus one evaluation frame per
# scenario, with a fixed random_state so the split is reproducible.
train_df, eval_df_s1, eval_df_s2 = split_train_test(df, random_state=42)

# Separate each evaluation frame into the part handed to the model and the
# ground-truth part used for scoring.
# NOTE(review): 0 and 6 are the scenario arguments used consistently in
# split_true / fit_model / predict below — presumably the number of known
# months after generic entry; confirm in src.train.
eval_df_s1, eval_df_s1_true = split_true(eval_df_s1, 0)
eval_df_s2, eval_df_s2_true = split_true(eval_df_s2, 6)

# Lag features are added to the training frame only.
# NOTE(review): the evaluation frames are passed to predict() without
# add_lags — presumably predict() builds them itself; confirm in src.model.
train_df = add_lags(train_df)

# One model per scenario, both fitted on the same training frame with the same seed.
model_s1 = fit_model(train_df, 0, seed=42)
model_s2 = fit_model(train_df, 6, seed=42)

eval_pred_df_s1 = predict(model_s1, eval_df_s1, 0)
eval_pred_df_s2 = predict(model_s2, eval_df_s2, 6)

# Score each scenario with its own metric function (truth, predictions, auxiliary frame).
print("Scenario 1 metric:", compute_metric1(eval_df_s1_true, eval_pred_df_s1, df_aux))
print("Scenario 2 metric:", compute_metric2(eval_df_s2_true, eval_pred_df_s2, df_aux))

## With Cross Validation

In [None]:
# K-fold evaluation: repeat the split / fit / predict / score pipeline on every
# fold yielded by split_k_fold, print the per-fold metrics, and finish with an
# aggregate (mean and standard deviation) so the folds can be compared as a
# single cross-validation score.
fold_metrics_s1 = []
fold_metrics_s2 = []

for train_df, eval_df_s1, eval_df_s2 in split_k_fold(df, random_state=42):
    # Separate model inputs from the ground truth used for scoring.
    eval_df_s1, eval_df_s1_true = split_true(eval_df_s1, 0)
    eval_df_s2, eval_df_s2_true = split_true(eval_df_s2, 6)

    train_df = add_lags(train_df)

    # verbose=False keeps the training logs out of the per-fold output.
    model_s1 = fit_model(train_df, 0, seed=42, verbose=False)
    model_s2 = fit_model(train_df, 6, seed=42, verbose=False)

    eval_pred_df_s1 = predict(model_s1, eval_df_s1, 0)
    eval_pred_df_s2 = predict(model_s2, eval_df_s2, 6)

    print('--------------------')
    metric_s1 = compute_metric1(eval_df_s1_true, eval_pred_df_s1, df_aux)
    print("Scenario 1 metric:", metric_s1)
    metric_s2 = compute_metric2(eval_df_s2_true, eval_pred_df_s2, df_aux)
    print("Scenario 2 metric:", metric_s2)

    fold_metrics_s1.append(metric_s1)
    fold_metrics_s2.append(metric_s2)

# Summarise across folds (skipped if the splitter yielded nothing).
# NOTE(review): assumes the metric functions return plain numbers — confirm in metric_calculation.
if fold_metrics_s1:
    print('====================')
    print(
        f"Scenario 1 CV mean: {np.mean(fold_metrics_s1):.4f} "
        f"(std {np.std(fold_metrics_s1):.4f}, {len(fold_metrics_s1)} folds)"
    )
    print(
        f"Scenario 2 CV mean: {np.mean(fold_metrics_s2):.4f} "
        f"(std {np.std(fold_metrics_s2):.4f}, {len(fold_metrics_s2)} folds)"
    )

## Fit on the whole dataset

In [None]:
# Refit both scenario models on the full preprocessed training data (no
# hold-out) and save them under the next free numeric run id in models/.
all_df = df.copy()

all_df = add_lags(all_df)

model_s1 = fit_model(all_df, 0, seed=42)
model_s2 = fit_model(all_df, 6, seed=42)

# Save model
# Create the directory on first use so listing and saving cannot fail on a
# fresh checkout.
os.makedirs("models", exist_ok=True)

# Collect the numeric suffix of every existing entry ("cb_s1_3.model" -> 3).
# Entries without a purely numeric suffix (e.g. ".gitkeep") are skipped
# instead of crashing int().
model_ids = [
    int(suffix)
    for suffix in (
        f.removesuffix(".model").split("_")[-1]
        for f in os.listdir("models")
    )
    if suffix.isdecimal()
]

# default=0 makes the first saved run id 1 when no model exists yet.
latest_id = max(model_ids, default=0)
model_s1.save_model(f"models/cb_s1_{latest_id + 1}.model")
model_s2.save_model(f"models/cb_s2_{latest_id + 1}.model")

In [None]:
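# # Load models (optional: uncomment this cell to reuse previously saved models
# # instead of refitting above; replace the id "2" with the run you want to load)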
# from catboost import CatBoostRegressor

# model_s1 = CatBoostRegressor()
# model_s1.load_model("models/cb_s1_2.model")

# model_s2 = CatBoostRegressor()
# model_s2.load_model("models/cb_s2_2.model")

## Create Submission

In [None]:
# Read the three raw test tables in the positional order expected by
# general_preprocessing; is_test=True selects its test-time behaviour, which
# returns the feature frame together with the frame the predictions are merged into.
test_volume_raw = pd.read_csv('data/test/df_volume_test.csv')
test_generics_raw = pd.read_csv('data/test/df_generics_test.csv')
test_medicine_info_raw = pd.read_csv('data/test/df_medicine_info_test.csv')

t_df, t_sub = general_preprocessing(
    test_volume_raw,
    test_generics_raw,
    test_medicine_info_raw,
    is_test=True,
)

# Predict each scenario with its own model.
t_df_s1, t_df_s2 = split_scenario(t_df)
t_pred_df_s1 = predict(model_s1, t_df_s1, 0)
t_pred_df_s2 = predict(model_s2, t_df_s2, 6)

# Stack both scenarios and keep only the key columns plus the predicted volume.
submission_keys = ["country", "brand_name", "months_postgx"]
t_pred = pd.concat([t_pred_df_s1, t_pred_df_s2])[submission_keys + ["volume"]]

# Left-join so every row of t_sub is kept; validate="one_to_one" makes pandas
# raise if the keys are duplicated on either side.
t_final = t_sub.merge(t_pred, on=submission_keys, how="left", validate="one_to_one")

In [None]:
# Write the submission under the next free numeric id in submissions/.
# Create the directory on first use so listing and writing cannot fail on a
# fresh checkout.
os.makedirs("submissions", exist_ok=True)

# Collect the ids of existing "submission<N>.csv" files. Names that start with
# "submission" but have no purely numeric id (e.g. "submission_best.csv") are
# skipped instead of crashing int().
submission_ids = [
    int(stem)
    for stem in (
        f.removeprefix("submission").removesuffix(".csv")
        for f in os.listdir("submissions/")
        if f.startswith("submission")
    )
    if stem.isdecimal()
]

# default=0 makes the first submission "submission1.csv" when none exists yet.
latest_id = max(submission_ids, default=0)
submission_path = f"submissions/submission{latest_id + 1}.csv"
t_final.to_csv(submission_path, index=False)
print(f"Saved to {submission_path}")