<center>
  <h2>Trabajo Laboratorio de Implementación III - MCD Virtual Cohorte 2022</h2>
  <h3>Forecasting para una Empresa de Consumo Masivo</h3>
  <h4>Creación de dataset group</h4>
</center>

In [20]:
# Importación librerias

# !pip install awswrangler
# !pip install boto3
import boto3
import awswrangler as wr
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
from dateutil.relativedelta import relativedelta
warnings.filterwarnings("ignore")

In [21]:
# Definición fechas

date_start=datetime.strptime('2017-01-01', '%Y-%m-%d')
date_end=datetime.strptime('2019-12-01', '%Y-%m-%d')
forecast_length = 2
date_future=date_end+relativedelta(months=forecast_length)
years=range(2018, 2020)

print(f"Fecha inicio historia {date_start}")
print(f"Fecha fin historia {date_end}")
print(f"Fecha futura (related time series) {date_future}")

timestamp_format="yyyy-MM-dd"

# Versión modelo
data_version = '1'
grouped_cols = "product_id"
grouped_cols_name = "product"

Fecha inicio historia 2017-01-01 00:00:00
Fecha fin historia 2019-12-01 00:00:00
Fecha futura (related time series) 2020-02-01 00:00:00


In [22]:
# Conexiones AWS

account_id=boto3.client('sts').get_caller_identity().get('Account')
session = boto3.Session() 
forecast = session.client(service_name='forecast')
region = forecast.meta.region_name
forecastquery = session.client(service_name='forecastquery')
role = f'arn:aws:iam::{account_id}:role/ForecastRole'

In [23]:
# Buckets de datos

target_s3_path=f's3://datasets-forecast/modelo-{data_version}-forecast-{grouped_cols_name}/target/df_ventas_{grouped_cols_name}.csv'
items_s3_path=f's3://datasets-forecast/modelo-{data_version}-forecast-{grouped_cols_name}/product/df_products.csv'
related_s3_path=f's3://datasets-forecast/modelo-{data_version}-forecast-{grouped_cols_name}/related/df_related_{grouped_cols_name}.csv'

In [24]:
# Verificar comunicación con Amazon Forecast
assert forecast.list_predictors()

In [25]:
# Nombres de datasets
ts_dataset_name = f"modelo_{data_version}_{grouped_cols_name}_dataset_target"
item_dataset_name = f"modelo_{data_version}_{grouped_cols_name}_dataset_items"
related_dataset_name = f"modelo_{data_version}_{grouped_cols_name}_dataset_related"

# Nombre de dataset group
dataset_group_name = f'modelo_{data_version}_{grouped_cols_name}'

# Nombre de import jobs
target_import_job_name = f"modelo_{data_version}_{grouped_cols_name}_dataset_target_import"
items_import_job_name = f"modelo_{data_version}_{grouped_cols_name}_dataset_items_import"
related_import_job_name = f"modelo_{data_version}_{grouped_cols_name}_dataset_related_import"

<a id='datasets' />

## Crear datasets en Amazon Forecast
[(back to top)](#contents)

### Parámetros

Los datos deben ser importados sin header, por lo cual se debe definir un esquema. El orden de los atributos debe ser el mismo que el de las columnas de los archivos. 

### Target Time Series Schema

In [26]:
ts_frequency='M'
ts_schema = {
	"Attributes": [
		{
			"AttributeName": "timestamp",
			"AttributeType": "timestamp"
		},
		{
			"AttributeName": "item_id",
			"AttributeType": "string"
		},
		{
			"AttributeName": "target_value",
			"AttributeType": "float"
		}
	]
}

El siguiente codigo funciona para crear dataset target según schema brindado. La siguiente salida arroja error porque este primer modelo fue ejecutado integramente desde la UI de forecast

In [27]:
create_dataset_response = forecast.create_dataset(Domain="CUSTOM",
                                                  DatasetType='TARGET_TIME_SERIES',
                                                  DatasetName=ts_dataset_name,
                                                  DataFrequency=ts_frequency,
                                                  Schema=ts_schema)

ResourceAlreadyExistsException: An error occurred (ResourceAlreadyExistsException) when calling the CreateDataset operation: A dataset already exists with the arn: arn:aws:forecast:us-east-1:637423651905:dataset/modelo_1_product_dataset_target

In [None]:
target_dataset_arn=create_dataset_response['DatasetArn']

In [None]:
target_dataset_description=forecast.describe_dataset(DatasetArn=target_dataset_arn)

In [None]:
print(f"The Dataset with ARN {target_dataset_arn} is now {target_dataset_description['Status']}.")

### Items dataset schema

In [None]:
items_schema = {
	"Attributes": [
		{
			"AttributeName": "item_id",
			"AttributeType": "string"
		},
		{
			"AttributeName": "cat1",
			"AttributeType": "string"
		},
		{
			"AttributeName": "cat2",
			"AttributeType": "string"
		},
		{
			"AttributeName": "cat3",
			"AttributeType": "string"
		},
		{
			"AttributeName": "brand",
			"AttributeType": "string"
		},
		{
			"AttributeName": "sku_size",
			"AttributeType": "integer"
		}
	]
}

In [None]:
create_dataset_response = forecast.create_dataset(Domain="CUSTOM",
                                                  DatasetType='ITEM_METADATA',
                                                  DatasetName=item_dataset_name,
                                                  Schema=items_schema)

In [None]:
items_dataset_arn=create_dataset_response['DatasetArn']

In [None]:
items_dataset_description=forecast.describe_dataset(DatasetArn=items_dataset_arn)

In [None]:
print(f"The Dataset with ARN {items_dataset_arn} is now {items_dataset_description['Status']}.")

### Related dataset schema

In [None]:
related_dataset_frequency='M'
related_schema = {
	"Attributes": [
		{
			"AttributeName": "timestamp",
			"AttributeType": "timestamp"
		},
		{
			"AttributeName": "item_id",
			"AttributeType": "string"
		},
		{
			"AttributeName": "month_number",
			"AttributeType": "integer"
		},
		{
			"AttributeName": "quarter",
			"AttributeType": "integer"
		},
		{
			"AttributeName": "days_in_month",
			"AttributeType": "integer"
		},
		{
			"AttributeName": "sundays",
			"AttributeType": "integer"
		},
		{
			"AttributeName": "saturdays",
			"AttributeType": "integer"
		},
		{
			"AttributeName": "tn_m3",
			"AttributeType": "float"
		},
		{
			"AttributeName": "tn_m6",
			"AttributeType": "float"
		},
		{
			"AttributeName": "tn_m12",
			"AttributeType": "float"
		}
	]
}

In [None]:
create_dataset_response = forecast.create_dataset(Domain="CUSTOM",
                                                  DatasetType='RELATED_TIME_SERIES',
                                                  DatasetName=related_dataset_name,
                                                  DataFrequency=related_dataset_frequency,
                                                  Schema=related_schema)

In [None]:
related_dataset_arn=create_dataset_response['DatasetArn']

In [None]:
related_dataset_description=forecast.describe_dataset(DatasetArn=related_dataset_arn)

In [None]:
print(f"The Dataset with ARN {related_dataset_arn} is now {related_dataset_description['Status']}.")

### Dataset Group

Para entrenar un predictor, requerimos un `DatasetGroup` que agrupa los datasets de input. 

In [None]:
dataset_arns = [target_dataset_arn, items_dataset_arn, related_dataset_arn]

In [None]:
create_dataset_group_response = \
    forecast.create_dataset_group(Domain="CUSTOM",
                                  DatasetGroupName=dataset_group_name,
                                  DatasetArns=dataset_arns)
dataset_group_arn = create_dataset_group_response['DatasetGroupArn']
describe_dataset_group_response = forecast.describe_dataset_group(DatasetGroupArn=dataset_group_arn)
print(f"The DatasetGroup with ARN {dataset_group_arn} is now {describe_dataset_group_response['Status']}.")

<a id='import' />

## Importar datasets en Amazon Forecast
[(back to top)](#contents)

### Target time series import job

In [None]:
target_dataset_import_job_response = \
    forecast.create_dataset_import_job(DatasetImportJobName=target_import_job_name,
                                       DatasetArn=target_dataset_arn,
                                       DataSource= {
                                         "S3Config" : {
                                             "Path": target_s3_path,
                                             "RoleArn": role
                                         } 
                                       },
                                       TimestampFormat=timestamp_format)

target_dataset_import_job_arn = target_dataset_import_job_response['DatasetImportJobArn']

In [None]:
forecast.describe_dataset_import_job(DatasetImportJobArn=target_dataset_import_job_arn)['Status']

### Items dataset import job

In [None]:
items_dataset_import_job_response = \
    forecast.create_dataset_import_job(DatasetImportJobName=items_import_job_name,
                                       DatasetArn=items_dataset_arn,
                                       DataSource= {
                                         "S3Config" : {
                                             "Path": items_s3_path,
                                             "RoleArn": role
                                         } 
                                       })

items_dataset_import_job_arn = items_dataset_import_job_response['DatasetImportJobArn']

In [None]:
forecast.describe_dataset_import_job(DatasetImportJobArn=items_dataset_import_job_arn)['Status']

### Related dataset import job

In [None]:

related_dataset_import_job_response = \
    forecast.create_dataset_import_job(DatasetImportJobName=related_import_job_name,
                                       DatasetArn=related_dataset_arn,
                                       DataSource= {
                                         "S3Config" : {
                                             "Path": related_s3_path,
                                             "RoleArn": role
                                         } 
                                       },
                                       TimestampFormat=timestamp_format)

related_dataset_import_job_arn = related_dataset_import_job_response['DatasetImportJobArn']

In [None]:
forecast.describe_dataset_import_job(DatasetImportJobArn=related_dataset_import_job_arn)['Status']