In [14]:
import pandas as pd
import numpy as np

from src.preprocess import general_preprocessing, add_lags, extend_test
from src.train import split_train_test, split_true, split_scenario
from src.model import fit_model, predict
from src.metrics import  metric_s1, metric_s2

In [None]:
# Load the three raw training tables (volume, generics, medicine info) in the
# order general_preprocessing takes them and build the modelling frame.
train_csv_paths = (
    'data/train/df_volume_train.csv',
    'data/train/df_generics_train.csv',
    'data/train/df_medicine_info_train.csv',
)
df = general_preprocessing(*(pd.read_csv(csv_path) for csv_path in train_csv_paths))

# Seeded split into a training frame plus one evaluation frame per scenario.
train_df, eval_df_s1, eval_df_s2 = split_train_test(df, random_state=42)

# Separate each evaluation frame into the model input and its held-out truth.
# NOTE(review): the second argument (0 / 6) looks like the first month_postgx
# to forecast in each scenario - confirm against src.train.split_true.
eval_df_s1, eval_df_s1_true = split_true(eval_df_s1, 0)
eval_df_s2, eval_df_s2_true = split_true(eval_df_s2, 6)

# Lag features are added to the training frame only in this cell.
train_df = add_lags(train_df)

In [None]:
def _fit_scenario_model(scenario_start):
    """Fit one model on ``train_df`` for the given scenario argument, seed 42.

    NOTE(review): ``scenario_start`` is forwarded as the second positional
    argument of ``fit_model``; it is presumably the first forecast month of
    the scenario (0 or 6) - confirm against src.model.fit_model.
    """
    return fit_model(train_df, scenario_start, seed=42)


# One model per scenario, fitted in the same order as before.
model_s1 = _fit_scenario_model(0)
model_s2 = _fit_scenario_model(6)

0:	learn: 0.2288120	total: 223ms	remaining: 7m 24s
200:	learn: 0.0604905	total: 20.6s	remaining: 3m 4s
400:	learn: 0.0522920	total: 40.9s	remaining: 2m 43s
600:	learn: 0.0464354	total: 1m 1s	remaining: 2m 22s
800:	learn: 0.0427441	total: 1m 20s	remaining: 2m 1s
1000:	learn: 0.0400341	total: 1m 40s	remaining: 1m 40s
1200:	learn: 0.0377640	total: 2m	remaining: 1m 20s
1400:	learn: 0.0358689	total: 2m 22s	remaining: 1m
1600:	learn: 0.0342243	total: 2m 42s	remaining: 40.6s
1800:	learn: 0.0328186	total: 3m 3s	remaining: 20.2s
1999:	learn: 0.0316462	total: 3m 23s	remaining: 0us
0:	learn: 0.2278611	total: 77.7ms	remaining: 2m 35s
200:	learn: 0.0574053	total: 21.3s	remaining: 3m 10s
400:	learn: 0.0501899	total: 41.6s	remaining: 2m 46s
600:	learn: 0.0446121	total: 1m	remaining: 2m 20s
800:	learn: 0.0404040	total: 1m 19s	remaining: 1m 58s
1000:	learn: 0.0373917	total: 1m 38s	remaining: 1m 38s
1200:	learn: 0.0349264	total: 1m 56s	remaining: 1m 17s
1400:	learn: 0.0329571	total: 2m 15s	remaining: 58

In [None]:
# Scenario 1 evaluation frame: predict, keep only rows with
# months_postgx >= 0, drop the prediction frame's own 'volume' column, then
# attach the held-out truth on the (country, brand_name, months_postgx) key.
# NOTE(review): 'volume' is presumably dropped so that the values coming from
# eval_df_s1_true are the ones scored - confirm against split_true's output.
eval_pred_df_s1 = (
    predict(model_s1, eval_df_s1, 0)
    .loc[lambda preds: preds['months_postgx'] >= 0]
    .drop(columns='volume')
    .merge(
        eval_df_s1_true,
        on=["country", "brand_name", "months_postgx"],
        # Fail loudly (pandas MergeError) if either side repeats a key instead
        # of silently multiplying rows and distorting the metric.
        validate="one_to_one",
    )
)


In [None]:
# Scenario 2 evaluation frame: predict, keep only rows with
# months_postgx >= 6, drop the prediction frame's own 'volume' column, then
# attach the held-out truth on the (country, brand_name, months_postgx) key.
# NOTE(review): 'volume' is presumably dropped so that the values coming from
# eval_df_s2_true are the ones scored - confirm against split_true's output.
eval_pred_df_s2 = (
    predict(model_s2, eval_df_s2, 6)
    .loc[lambda preds: preds['months_postgx'] >= 6]
    .drop(columns='volume')
    .merge(
        eval_df_s2_true,
        on=["country", "brand_name", "months_postgx"],
        # Fail loudly (pandas MergeError) if either side repeats a key instead
        # of silently multiplying rows and distorting the metric.
        validate="one_to_one",
    )
)


In [None]:
# Score each scenario's merged prediction/truth frame and report it.
# Each score is computed immediately before it is printed.
score_s1 = metric_s1(eval_pred_df_s1)
print("Scenario 1 metric:", score_s1)

score_s2 = metric_s2(eval_pred_df_s2)
print("Scenario 2 metric:", score_s2)

Scenario 1 metric: 0.28705791319552626
Scenario 2 metric: 0.14079830600107882


In [None]:
# extend_test returns two frames: the first feeds preprocessing, the second
# (t_sub) is the left side of the final merge, i.e. the submission skeleton.
t_vol, t_sub = extend_test(pd.read_csv('data/test/df_volume_test.csv'))

# Same preprocessing entry point as for training, fed with the test tables.
t_df = general_preprocessing(
    t_vol,
    pd.read_csv('data/test/df_generics_test.csv'),
    pd.read_csv('data/test/df_medicine_info_test.csv'),
)

# Route each test series to its scenario and predict with the matching model.
t_df_s1, t_df_s2 = split_scenario(t_df)
t_pred_df_s1 = predict(model_s1, t_df_s1, 0)
t_pred_df_s2 = predict(model_s2, t_df_s2, 6)

# Stack both scenarios, derive 'volume' as pred * Avgj, and keep only the
# key columns plus 'volume'.
# NOTE(review): multiplying by 'Avgj' presumably rescales a normalised
# prediction back to volume units - confirm against src.preprocess.
t_pred = pd.concat([t_pred_df_s1, t_pred_df_s2]).assign(
    volume=lambda stacked: stacked['pred'] * stacked['Avgj']
)[['country', 'brand_name', 'months_postgx', 'volume']]

# A left join keeps every row of t_sub; one_to_one makes pandas raise if a
# key is repeated on either side.
t_final = t_sub.merge(
    t_pred,
    how="left",
    on=["country", "brand_name", "months_postgx"],
    validate="one_to_one",
)

In [None]:
# Write the submission file without the DataFrame index column.
submission_path = "submission.csv"
t_final.to_csv(submission_path, index=False)