In [1]:
import pandas as pd
import numpy as np

from src.preprocess import general_preprocessing, add_lags, extend_test
from src.train import split_train_test, split_true, split_scenario
from src.model import fit_model, predict
from src.metrics import  metric_s1, metric_s2

In [2]:
# Read the three raw training tables, then hand them to the shared
# preprocessing step in the order: volume, generics, medicine info.
train_volume_raw = pd.read_csv('data/train/df_volume_train.csv')
train_generics_raw = pd.read_csv('data/train/df_generics_train.csv')
train_medicine_info_raw = pd.read_csv('data/train/df_medicine_info_train.csv')
df = general_preprocessing(train_volume_raw, train_generics_raw, train_medicine_info_raw)

# One training frame plus one evaluation frame per scenario; the fixed
# random_state keeps the split identical across re-runs.
train_df, eval_df_s1, eval_df_s2 = split_train_test(df, random_state=42)

# Split each evaluation frame into a model-input part and a "true" part.
# The second argument (0 / 6) matches the months_postgx cut-off used when the
# scenarios are scored further down.
# NOTE(review): presumably the first month hidden from the model — confirm in src.train.
eval_df_s1, eval_df_s1_true = split_true(eval_df_s1, 0)
eval_df_s2, eval_df_s2_true = split_true(eval_df_s2, 6)

# Only the training frame goes through add_lags here; the evaluation frames are
# passed to predict() as they are.
train_df = add_lags(train_df)

In [3]:
# Fit one model per scenario on the same training frame and with the same seed.
# The models differ only in the second argument (0 for scenario 1, 6 for
# scenario 2); they are fitted in that order.
model_s1, model_s2 = [
    fit_model(train_df, scenario_start, seed=42)
    for scenario_start in (0, 6)
]

0:	learn: 0.2292218	total: 208ms	remaining: 6m 56s
200:	learn: 0.0777167	total: 20.6s	remaining: 3m 4s
400:	learn: 0.0732936	total: 45.5s	remaining: 3m 1s
600:	learn: 0.0689096	total: 1m 8s	remaining: 2m 38s
800:	learn: 0.0659110	total: 1m 33s	remaining: 2m 20s
1000:	learn: 0.0636569	total: 1m 58s	remaining: 1m 57s
1200:	learn: 0.0619580	total: 2m 20s	remaining: 1m 33s
1400:	learn: 0.0605669	total: 2m 42s	remaining: 1m 9s
1600:	learn: 0.0594113	total: 3m 4s	remaining: 46s
1800:	learn: 0.0581556	total: 3m 28s	remaining: 23s
1999:	learn: 0.0571363	total: 3m 51s	remaining: 0us
0:	learn: 0.2280541	total: 112ms	remaining: 3m 44s
200:	learn: 0.0689758	total: 24.7s	remaining: 3m 41s
400:	learn: 0.0651554	total: 47.1s	remaining: 3m 8s
600:	learn: 0.0612721	total: 1m 10s	remaining: 2m 43s
800:	learn: 0.0586747	total: 1m 31s	remaining: 2m 16s
1000:	learn: 0.0562235	total: 1m 51s	remaining: 1m 51s
1200:	learn: 0.0547063	total: 2m 11s	remaining: 1m 27s
1400:	learn: 0.0531061	total: 2m 31s	remainin

In [4]:
# Scenario 1 evaluation frame: predict, keep only the rows that are scored
# (months_postgx >= 0), and attach the "true" frame produced by split_true.
s1_forecast = predict(model_s1, eval_df_s1, 0)
s1_scored_rows = s1_forecast[s1_forecast['months_postgx'] >= 0]

eval_pred_df_s1 = (
    s1_scored_rows
    # Remove the 'volume' column carried by the prediction frame before joining.
    # NOTE(review): presumably so it cannot collide with a column of the same
    # name in eval_df_s1_true — confirm against split_true.
    .drop(['volume'], axis=1)
    .merge(
        eval_df_s1_true,
        on=["country", "brand_name", "months_postgx"],
        # Same guard as the submission merge: a duplicated key on either side
        # would silently multiply rows and distort the metric, so fail loudly
        # with a MergeError instead.
        validate="one_to_one",
    )
)


In [5]:
# Scenario 2 evaluation frame: predict, keep only the rows that are scored
# (months_postgx >= 6), and attach the "true" frame produced by split_true.
s2_forecast = predict(model_s2, eval_df_s2, 6)
s2_scored_rows = s2_forecast[s2_forecast['months_postgx'] >= 6]

eval_pred_df_s2 = (
    s2_scored_rows
    # Remove the 'volume' column carried by the prediction frame before joining.
    # NOTE(review): presumably so it cannot collide with a column of the same
    # name in eval_df_s2_true — confirm against split_true.
    .drop(['volume'], axis=1)
    .merge(
        eval_df_s2_true,
        on=["country", "brand_name", "months_postgx"],
        # Same guard as the submission merge: a duplicated key on either side
        # would silently multiply rows and distort the metric, so fail loudly
        # with a MergeError instead.
        validate="one_to_one",
    )
)

In [6]:
# Report the evaluation metric of each scenario on its merged
# prediction/truth frame, one line per scenario.
score_s1 = metric_s1(eval_pred_df_s1)
print("Scenario 1 metric:", score_s1)
score_s2 = metric_s2(eval_pred_df_s2)
print("Scenario 2 metric:", score_s2)

Scenario 1 metric: 0.15719595717362175
Scenario 2 metric: 0.10881704642988205


In [7]:
# extend_test returns two frames from the raw test volumes: t_vol feeds the
# preprocessing step, t_sub is the frame the predictions are joined onto below.
test_volume_raw = pd.read_csv('data/test/df_volume_test.csv')
t_vol, t_sub = extend_test(test_volume_raw)

# Same preprocessing entry point and argument order as for the training data.
test_generics_raw = pd.read_csv('data/test/df_generics_test.csv')
test_medicine_info_raw = pd.read_csv('data/test/df_medicine_info_test.csv')
t_df = general_preprocessing(t_vol, test_generics_raw, test_medicine_info_raw)

# Route each part of the test set to the model fitted for its scenario.
t_df_s1, t_df_s2 = split_scenario(t_df)
t_pred_df_s1 = predict(model_s1, t_df_s1, 0)
t_pred_df_s2 = predict(model_s2, t_df_s2, 6)

submission_keys = ["country", "brand_name", "months_postgx"]

# Stack both scenarios, compute volume as pred * Avgj, and keep only the key
# columns plus that volume.
# NOTE(review): presumably pred is a normalised volume and Avgj its per-brand scale — confirm.
t_pred = (
    pd.concat([t_pred_df_s1, t_pred_df_s2])
    .assign(volume=lambda stacked: stacked['pred'] * stacked['Avgj'])
    [[*submission_keys, 'volume']]
)

# Left join keeps every row of t_sub; validate raises a MergeError if a key
# is duplicated on either side.
t_final = t_sub.merge(t_pred, on=submission_keys, how="left", validate="one_to_one")

In [8]:
# Write the final frame to disk without the DataFrame index column.
submission_path = "submission.csv"
t_final.to_csv(submission_path, index=False)