<a href="https://colab.research.google.com/github/wandb/edu/blob/main/decision-opt-course/3_dynamic_decision_opt_data_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
<!--- @wandbcode{decisionopt-nb3a} -->

# Lesson 3 - Dynamic Decision Optimization - Prepare data

In [None]:
import os
import pandas as pd
import pickle
import torch
import wandb
from pathlib import Path
from utils.modeling import make_model

# W&B project that the runs started in this notebook are logged to.
wandb_project = "decision_opt_bimbo"
# Ask wandb to keep its console output quiet for the rest of the session.
os.environ.update(WANDB_QUIET="true")

In [None]:
# Fetch the course dataset from its W&B artifact, then read the Bimbo
# training table from the downloaded directory.
with wandb.init(project=wandb_project) as run:
    artifact = run.use_artifact("danbecker/edu-decision-opt-course/course-data:v0")
    data_dir = Path(artifact.download())

train_csv = data_dir.joinpath('bimbo/train.csv')
all_data = pd.read_csv(train_csv)
# Preview the first rows (displayed as the cell output).
all_data.head()

In [None]:
# Number of rows for each Semana (week) value, ordered by week.
all_data["Semana"].value_counts().sort_index()

In [None]:
# Inclusive Semana bounds for the rows used to fit the predictive (ML) model.
MIN_ML_MODEL_WEEK, MAX_ML_MODEL_WEEK = 3, 4
# Inclusive Semana bounds for the rows set aside as decision-model data.
MIN_DECISION_MODEL_WEEK, MAX_DECISION_MODEL_WEEK = 5, 9

In [None]:
# Columns whose combination identifies one group of rows.
store_product_group_cols = [
    'Agencia_ID',
    'Canal_ID',
    'Ruta_SAK',
    'Cliente_ID',
    'Producto_ID',
]
# Number of rows in all_data for each distinct combination of those columns.
store_product_value_counts = all_data.groupby(by=store_product_group_cols).size()

In [None]:
# Summary statistics of the per-group row counts (displayed as the cell output).
store_product_value_counts.describe()

In [None]:
# Flag the groups that have exactly 7 rows.
# NOTE(review): 7 presumably equals the number of weeks in the data (one row
# per group per week) - confirm against the Semana counts.
full_filled_cases = store_product_value_counts.eq(7)
# Index the rows by their group key so the per-group flags can select them,
# then turn the group key back into ordinary columns.
full_filled_data = (
    all_data.set_index(store_product_group_cols)[full_filled_cases]
    .reset_index()
)

In [None]:
# (rows, columns) of the filtered frame (displayed as the cell output).
full_filled_data.shape

In [None]:
# Split the retained rows by week; both bounds of each range are inclusive.
week = full_filled_data["Semana"]
prediction_data = full_filled_data[week.between(MIN_ML_MODEL_WEEK, MAX_ML_MODEL_WEEK)]
decision_data = full_filled_data[week.between(MIN_DECISION_MODEL_WEEK, MAX_DECISION_MODEL_WEEK)]

# Build the predictive model from the earlier weeks; make_model returns the
# model together with an encoder (see utils.modeling for the details).
model, encoder = make_model(prediction_data, run_name="prediction_model")

# Persist both objects to the working directory.
torch.save(model, 'predictive_model.pt')
# NOTE(review): 'catgeorical' looks like a typo for 'categorical'; the file
# name is kept unchanged because other code may load this exact path.
with open('catgeorical_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)


In [None]:
# Write both splits to local parquet files so they can be attached to a
# W&B artifact.
prediction_data.to_parquet('prediction_data.parquet')
decision_data.to_parquet('decision_data.parquet')

wandb_project = "decision_opt_bimbo"
# `project` must be passed by keyword: the first positional parameter of
# wandb.init is not the project name, so a bare positional string would not
# place this run in the intended project.
with wandb.init(project=wandb_project) as run:
    # Bundle the two parquet files into a single dataset artifact.
    dynamic_optimization_artifact = wandb.Artifact('dynamic_optimization_data', type='dataset')
    dynamic_optimization_artifact.add_file('prediction_data.parquet', name='prediction_data.parquet')
    dynamic_optimization_artifact.add_file('decision_data.parquet', name='decision_data.parquet')
    # Log the artifact on the run that was just started.
    run.log_artifact(dynamic_optimization_artifact)