#  Data Preparation


In [None]:
import tensorflow as tf
import os
import argparse
import sys
import random
import math
import logging
import operator
import itertools
import datetime
import numpy as np
import pandas as pd
from csv import reader
from random import randrange

# Parsed command-line flags; assigned in the __main__ block at the bottom of this cell.
FLAGS = None

# Route the 'tensorflow' Python logger through one timestamped stream handler.
logger = logging.getLogger('tensorflow')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)
# Remove previously installed handlers BEFORE attaching ours. Removing
# logger.handlers[0] after addHandler() deletes our own handler whenever the
# logger starts out without one, which silences every message because
# propagation is switched off below.
for existing_handler in list(logger.handlers):
  logger.removeHandler(existing_handler)
logger.addHandler(handler)
# Keep records away from the root logger so each message is printed only once.
logger.propagate = False

def sales_example(sales):
  """Wrap a sequence of sales values in a ``tf.train.Example``.

  Args:
    sales: iterable of floats stored as the float-list feature 'sales'.

  Returns:
    A ``tf.train.Example`` holding the single feature 'sales'.
  """
  sales_feature = tf.train.Feature(float_list=tf.train.FloatList(value=sales))
  features = tf.train.Features(feature={'sales': sales_feature})
  return tf.train.Example(features=features)

def capacity_example(capacity):
  """Wrap a sequence of shelf capacities in a ``tf.train.Example``.

  Args:
    capacity: iterable of floats stored as the float-list feature 'capacity'.

  Returns:
    A ``tf.train.Example`` holding the single feature 'capacity'.
  """
  capacity_feature = tf.train.Feature(float_list=tf.train.FloatList(value=capacity))
  features = tf.train.Features(feature={'capacity': capacity_feature})
  return tf.train.Example(features=features)

def stock_example(stock):
  """Wrap a sequence of stock levels in a ``tf.train.Example``.

  Args:
    stock: iterable of floats stored as the float-list feature 'stock'.

  Returns:
    A ``tf.train.Example`` holding the single feature 'stock'.
  """
  stock_feature = tf.train.Feature(float_list=tf.train.FloatList(value=stock))
  features = tf.train.Features(feature={'stock': stock_feature})
  return tf.train.Example(features=features)

#https://stackoverflow.com/questions/553303/generate-a-random-date-between-two-other-dates
def random_date(start, end):
  """Return a random moment between ``start`` and ``end`` (both inclusive).

  A whole number of seconds is drawn uniformly from the span between the two
  arguments and added to ``start``; the result has the same type as ``start``.
  """
  span_seconds = int((end - start).total_seconds())
  offset = datetime.timedelta(seconds=random.randint(0, span_seconds))
  return start + offset

def _write_sales_records(tfrecords_file, time_periods, grocery, product_ids):
  """Write one 'sales' Example per time period; return the number of records written."""
  counter = 0
  with tf.io.TFRecordWriter(tfrecords_file) as writer:
    for t in time_periods:
      # Select the period's rows once and split them per product afterwards,
      # instead of scanning the whole frame again for every product.
      period_rows = grocery[grocery['time_period'] == t]
      sales = [period_rows[period_rows['product_id'] == product_id]['quantity'].sum() for product_id in product_ids]

      logger.debug ("pediod {}: {}".format(t, sales))
      tf_example = sales_example(np.array(sales, dtype=np.float32))
      writer.write(tf_example.SerializeToString())
      counter = counter + 1

  return counter

def create_records(number_of_products, start_date, end_date, start_time_period, middle_time_period, end_time_period, orders_file, products_file, departments_file, order_products_prior_file, order_products_train_file, train_tfrecords_file, test_tfrecords_file, capacity_tfrecords_file, stock_tfrecords_file):
  """Build the stock, capacity, train and test TFRecord files from the order CSVs.

  Steps:
    1. Write one random stock vector (uniform in [0, 1)) to stock_tfrecords_file.
    2. Give every order a synthetic timestamp: the first order of each user is
       drawn between start_date and end_date and moved forward to the recorded
       day of week; later orders are offset by their days-since-prior value.
       Timestamps are bucketed into 6-hour time periods counted from start_date.
    3. Pick number_of_products random products among the most ordered ones
       (share given by FLAGS.top_products) in a fixed list of grocery departments.
    4. Write the per-product shelf capacity, then per-period sales vectors:
       periods [start_time_period, middle_time_period) go to the train file and
       [middle_time_period, end_time_period] to the test file.

  Args:
    number_of_products: how many products to sample.
    start_date, end_date: date range for the first order of every user.
    start_time_period, middle_time_period, end_time_period: period boundaries;
      end_time_period == -1 means "up to the last period present in the data".
    orders_file, products_file, departments_file, order_products_prior_file,
      order_products_train_file: input CSV locations.
    train_tfrecords_file, test_tfrecords_file, capacity_tfrecords_file,
      stock_tfrecords_file: output TFRecord locations.

  Raises:
    ValueError: if fewer than number_of_products candidate products exist.

  NOTE: the top-products share is still read from the global FLAGS.top_products
  because it is not part of this function's signature.
  """
  # Use the argument rather than the global flag so the stock vector always
  # matches the number of products selected below.
  stock = np.random.uniform(low=0.0, high=1.0, size=(number_of_products))
  with tf.io.TFRecordWriter(stock_tfrecords_file) as writer:
    logger.debug ("stock: {}".format(stock))
    tf_example = stock_example(stock)
    writer.write(tf_example.SerializeToString())

  with open(orders_file, 'r', newline='') as f:
    csv_reader = reader(f)
    next(csv_reader)  # skip the header row
    orders_list = list(map(tuple, csv_reader))

  # Row fields used below: [0] order_id, [1] user_id, [4] order_dow,
  # [5] hour of the order, [6] days since the previous order; [3] orders one
  # user's rows (presumably the order number - confirm against the CSV header).
  sorted_orders = sorted(orders_list, key = lambda x: (int(x[1]), int(x[3])))

  # Normalise the range boundaries to datetimes once, before the loop.
  start_date = datetime.datetime.combine(start_date, datetime.datetime.min.time())
  end_date = datetime.datetime.combine(end_date, datetime.datetime.min.time())
  seconds_per_period = 60*60*6

  dated_orders = []

  for _, user_orders in itertools.groupby(sorted_orders, lambda x : int(x[1])):
    item = next(user_orders)
    order_date = random_date(start_date, end_date)
    # Advance to the first date whose weekday matches the recorded day of week.
    while order_date.weekday() != int(item[4]):
      order_date = order_date + datetime.timedelta(days=1)

    order_date = datetime.datetime(order_date.year, order_date.month, order_date.day, int(item[5]), 0, 0)
    time_period = int((order_date - start_date).total_seconds() / seconds_per_period)
    dated_orders.append((int(item[0]), int(item[1]), int(item[4]), order_date, time_period))

    for item in user_orders:
      order_date = order_date + datetime.timedelta(days=int(float(item[6])))
      order_date = datetime.datetime(order_date.year, order_date.month, order_date.day, int(item[5]), 0, 0)
      time_period = int((order_date - start_date).total_seconds() / seconds_per_period)
      dated_orders.append((int(item[0]), int(item[1]), int(item[4]), order_date, time_period))

  orders = pd.DataFrame(dated_orders, columns =['order_id', 'user_id', 'order_dow', 'order_date', 'time_period'])

  # Read the files the caller asked for instead of hard-coded names.
  products = pd.read_csv(products_file)
  departments = pd.read_csv(departments_file)
  prior_order = pd.read_csv(order_products_prior_file)
  train_order = pd.read_csv(order_products_train_file)

  ntop = int(FLAGS.top_products*products['product_id'].count())

  all_ordered_products = pd.concat([prior_order, train_order], axis=0)[["order_id", "product_id"]]

  largest = all_ordered_products[['product_id']].groupby(['product_id']).size().nlargest(ntop).to_frame()
  largest.reset_index(inplace=True)

  products_largest = pd.merge(largest, products, how="left", on="product_id")[['product_id', 'product_name', 'aisle_id', 'department_id']]

  products_departments = pd.merge(products_largest, departments, how="left", on="department_id")

  products_departments = products_departments[products_departments["department"].isin(["frozen", "bakery", "produce", "beverages", "dry goods pasta", "meat seafood", "pantry", "breakfast", "canned goods", "dairy eggs", "snacks", "deli"])]

  products_departments_list = products_departments.values.tolist()

  if number_of_products > len(products_departments_list):
    # Without this guard the sampling loop below could never terminate.
    raise ValueError("requested {} products but only {} candidates are available".format(number_of_products, len(products_departments_list)))

  products_subset=set()
  while len(products_subset) < number_of_products:
    # randrange excludes the upper bound; randint(0, len(...)) could return
    # len(...) itself and crash the list lookups below.
    products_subset.add(random.randrange(len(products_departments_list)))

  selected_products_departments_list = [products_departments_list[i] for i in products_subset]
  selected_products_list = [products_departments_list[i][0] for i in products_subset]

  for p, product_id in enumerate(selected_products_list):
    logger.info ("{} {}".format(p, product_id))

  selected_products_departments = pd.DataFrame(selected_products_departments_list, columns =['product_id', 'product_name', 'aisle_id', 'department_id', 'department'])

  # Every order line counts as one unit sold; add the column in one vectorised
  # step rather than materialising the rows one tuple at a time.
  all_ordered_products_quantity = all_ordered_products.reset_index(drop=True).assign(quantity=1)

  order_product_departments = pd.merge(selected_products_departments, all_ordered_products_quantity, how="left", on="product_id")
  order_product_departments_dates = pd.merge(order_product_departments, orders, how="left", on="order_id")

  grocery = order_product_departments_dates[["order_id", "product_id", "product_name", "order_date", "time_period", 'quantity']]

  # Average sales per 6-hour period scaled by 4 periods/day * 3 days.
  shelf_capacity = ((grocery.groupby(['product_id'])['quantity'].sum()/grocery['time_period'].nunique())*4*3).to_frame()
  shelf_capacity.reset_index(inplace=True)

  with tf.io.TFRecordWriter(capacity_tfrecords_file) as writer:
    capacity = [
      math.ceil(shelf_capacity[shelf_capacity['product_id'] == product_id]['quantity'].values[0])
      for product_id in selected_products_list
    ]

    logger.debug ("capacity: {}".format(capacity))
    tf_example = capacity_example(np.array(capacity, dtype=np.float32))
    writer.write(tf_example.SerializeToString())

  counter = _write_sales_records(train_tfrecords_file, range(start_time_period, middle_time_period), grocery, selected_products_list)

  logger.info ("created {} train sales records".format(counter))

  if end_time_period == -1:
    # int() keeps range() working even if pandas reports the maximum as a float.
    end_time_period = int(grocery['time_period'].max())

  counter = _write_sales_records(test_tfrecords_file, range(middle_time_period, end_time_period+1), grocery, selected_products_list)

  logger.info ("created {} test sales records".format(counter))

def main():
  """Run the data preparation with the settings held in the global FLAGS."""
  create_records(
    number_of_products=FLAGS.number_of_products,
    start_date=FLAGS.start_date,
    end_date=FLAGS.end_date,
    start_time_period=FLAGS.start_time_period,
    middle_time_period=FLAGS.middle_time_period,
    end_time_period=FLAGS.end_time_period,
    orders_file=FLAGS.orders_file,
    products_file=FLAGS.products_file,
    departments_file=FLAGS.departments_file,
    order_products_prior_file=FLAGS.order_products_prior_file,
    order_products_train_file=FLAGS.order_products_train_file,
    train_tfrecords_file=FLAGS.train_tfrecords_file,
    test_tfrecords_file=FLAGS.test_tfrecords_file,
    capacity_tfrecords_file=FLAGS.capacity_tfrecords_file,
    stock_tfrecords_file=FLAGS.stock_tfrecords_file,
  )

if __name__ == '__main__':
  # Command-line options of the data-preparation step, in the order in which
  # they are registered (and therefore listed by --help).
  argument_specs = [
    ('--number_of_products', dict(type=int, default=100,
            help='Subset of products from whole ~50k dataset.')),
    ('--top_products', dict(type=float, default=0.2,
            help='Top percentage of products to consider, so shelf capacity equal to 3-days of sales will have a reasonable number > 1.')),
    ('--start_date', dict(type=datetime.date.fromisoformat, default='2017-01-01',
            help='Start date random range to create timestampts.')),
    ('--end_date', dict(type=datetime.date.fromisoformat, default='2017-01-06',
            help='End date random range to create timestampts.')),
    ('--start_time_period', dict(type=int, default=0,
            help='Start timestep for train dataset.')),
    ('--middle_time_period', dict(type=int, default=1000,
            help='End timestep for train dataset and this is the first timestep for test dataset.')),
    ('--end_time_period', dict(type=int, default=-1,
            help='Last timestep for test dataset. If -1 than until the end of data.')),
    ('--orders_file', dict(type=str, default='orders.csv',
            help='orders file location.')),
    ('--products_file', dict(type=str, default='products.csv',
            help='products file location.')),
    ('--departments_file', dict(type=str, default='departments.csv',
            help='departments file location.')),
    ('--order_products_prior_file', dict(type=str, default='order_products__prior.csv',
            help='order_products_prior file location.')),
    ('--order_products_train_file', dict(type=str, default='order_products__train.csv',
            help='order_products_train file location.')),
    ('--logging', dict(default='INFO', choices=['DEBUG','INFO','WARNING','ERROR','CRITICAL'],
            help='Enable excessive variables screen outputs.')),
    ('--train_tfrecords_file', dict(type=str, default='train.tfrecords',
            help='train sales tfrecords output file')),
    ('--test_tfrecords_file', dict(type=str, default='test.tfrecords',
            help='test sales tfrecords output file')),
    ('--capacity_tfrecords_file', dict(type=str, default='capacity.tfrecords',
            help='shelf capacity tfrecords output file, train or test')),
    ('--stock_tfrecords_file', dict(type=str, default='stock.tfrecords',
            help='Stock data for each product for predict mode.')),
  ]

  parser = argparse.ArgumentParser()
  for flag, options in argument_specs:
    parser.add_argument(flag, **options)

  # parse_known_args ignores arguments this script does not define.
  FLAGS, unparsed = parser.parse_known_args()

  logger.setLevel(FLAGS.logging)

  logger.debug ("Running with parameters: {}".format(FLAGS))

  main()

# Training the model


In [None]:
import os
import sys
import argparse

# Set TensorFlow's native log level before the tensorflow import below;
# level "2" is documented to hide INFO and WARNING messages.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import tensorflow_addons as tfa

#tf.debugging.set_log_device_placement(True)

import numpy as np
# Print arrays wide, with 12 decimals and without scientific notation.
np.set_printoptions(edgeitems=25, linewidth=10000, precision=12, suppress=True)

# Parsed command-line flags; assigned in the __main__ block at the bottom of this cell.
FLAGS = None

class Dense(tf.Module):
  """Fully connected layer computing ``activation(x @ w + b)``.

  ``w`` is initialised from a truncated normal with the given ``stddev`` and
  ``b`` with zeros. When ``activation`` is falsy the affine output is returned
  unchanged.
  """
  def __init__(self, input_dim, output_size, activation=None, stddev=1.0):
    super().__init__()
    initial_weights = tf.random.truncated_normal([input_dim, output_size], stddev=stddev)
    self.w = tf.Variable(initial_weights, name='w')
    self.b = tf.Variable(tf.zeros([output_size]), name='b')
    self.activation = activation
  def __call__(self, x):
    affine = tf.matmul(x, self.w) + self.b
    return self.activation(affine) if self.activation else affine

#Policy network
class Actor(tf.Module):
  """Policy network: three hidden layers followed by a softmax over the actions.

  Each hidden block is Dense -> activation -> dropout. The Dense layers are
  created without an activation of their own; the shared ``activation`` is
  applied in ``__call__``.
  """
  def __init__(self, num_features, num_actions, hidden_size, activation=tf.nn.relu, dropout_prob=0.1):
    super().__init__()
    self.layer1 = Dense(num_features, hidden_size, activation=None)
    self.layer2 = Dense(hidden_size, hidden_size, activation=None)
    self.layer3 = Dense(hidden_size, hidden_size, activation=None)
    self.layer4 = Dense(hidden_size, num_actions, activation=None)
    self.activation = activation
    self.dropout_prob = dropout_prob
  def __call__(self, state):
    """Return action probabilities: (rows, num_features) --> (rows, num_actions)."""
    hidden = state
    for hidden_layer in (self.layer1, self.layer2, self.layer3):
      hidden = self.activation(hidden_layer(hidden))
      hidden = tf.nn.dropout(hidden, self.dropout_prob)

    logits = self.layer4(hidden)
    return tf.nn.softmax(logits)

#Value network
class Critic(tf.Module):
  """Value network: one normalised hidden layer followed by a scalar output per row."""
  def __init__(self, num_features, hidden_size, activation=tf.nn.relu, dropout_prob=0.1):
    super().__init__()
    self.layer1 = Dense(num_features, hidden_size, activation=None)
    self.layer2 = Dense(hidden_size, 1, activation=None)
    self.activation = activation
    self.dropout_prob = dropout_prob
  def __call__(self, state):
    """Return one value estimate per input row: (rows, num_features) --> (rows)."""
    hidden = self.layer1(state)
    # NOTE(review): a new GroupNormalization layer is instantiated on every
    # call and is not stored on the module, so its parameters are presumably
    # neither trained nor checkpointed - confirm this is intended.
    normalizer = tfa.layers.GroupNormalization(groups = 1)
    hidden = self.activation(normalizer(hidden))
    hidden = tf.nn.dropout(hidden, self.dropout_prob)

    value = self.layer2(hidden)

    #[rows, 1] --> [rows]
    return tf.squeeze(value, axis=-1, name='factor_squeeze')

def sales_parser(serialized_example):
  """Parse one serialized Example into a dict with the float32 feature 'sales'.

  The feature has a fixed length of FLAGS.num_products. Any int64 tensor in
  the parsed result is converted to float32.
  """
  example = tf.io.parse_single_example(
    serialized_example,
    features={
      "sales": tf.io.FixedLenFeature([FLAGS.num_products], tf.float32)
    })

  for name in list(example.keys()):
    t = example[name]
    if t.dtype == tf.int64:
      # tf.to_float32 does not exist in the TensorFlow API; tf.cast performs
      # the intended conversion.
      example[name] = tf.cast(t, tf.float32)

  return example

def capacity_parser(serialized_example):
  """Parse one serialized Example into a dict with the float32 feature 'capacity'.

  The feature has a fixed length of FLAGS.num_products. Any int64 tensor in
  the parsed result is converted to float32.
  """
  example = tf.io.parse_single_example(
    serialized_example,
    features={
      "capacity": tf.io.FixedLenFeature([FLAGS.num_products], tf.float32)
    })

  for name in list(example.keys()):
    t = example[name]
    if t.dtype == tf.int64:
      # tf.to_float32 does not exist in the TensorFlow API; tf.cast performs
      # the intended conversion.
      example[name] = tf.cast(t, tf.float32)

  return example

def stock_parser(serialized_example):
  """Parse one serialized Example into a dict with the float32 feature 'stock'.

  The feature has a fixed length of FLAGS.num_products. Any int64 tensor in
  the parsed result is converted to float32.
  """
  example = tf.io.parse_single_example(
    serialized_example,
    features={
      "stock": tf.io.FixedLenFeature([FLAGS.num_products], tf.float32)
    })

  for name in list(example.keys()):
    t = example[name]
    if t.dtype == tf.int64:
      # tf.to_float32 does not exist in the TensorFlow API; tf.cast performs
      # the intended conversion.
      example[name] = tf.cast(t, tf.float32)

  return example

def waste(x):
  """Return the waste estimate for stock ``x``: the fraction ``FLAGS.waste`` of it."""
  waste_fraction = FLAGS.waste
  return waste_fraction * x

def quantile(x, q):
  """Return the ``q``-th quantile (0 <= q <= 1) of the values in ``x``.

  Computed by ``np.quantile``. The tensor-based interpolation that used to
  follow the return statement was unreachable (and would have indexed past
  the end of ``x`` for high quantiles), so it has been removed.
  """
  return np.quantile(x, q)

def cross_entropy(p, q):
  """Return the mean over rows of the cross entropy ``-sum(p * log(q))`` along axis 1.

  ``q`` is clipped from below at 1e-15 before the logarithm is taken.
  """
  log_q = tf.math.log(tf.math.maximum(1e-15, q))
  per_row = tf.reduce_sum(p*log_q, axis=1)
  return -tf.reduce_mean(per_row)

class Env(tf.Module):
  """Small network with the same layout as Critic: one normalised hidden layer and a scalar output per row.

  NOTE(review): nothing in this file instantiates the class.
  """
  def __init__(self, num_features, hidden_size, activation=tf.nn.relu, dropout_prob=0.1):
    # super(Critic, self) raised TypeError because an Env is not a Critic.
    super(Env, self).__init__()
    self.layer1 = Dense(num_features, hidden_size, activation=None)
    self.layer2 = Dense(hidden_size, 1, activation=None)
    self.activation = activation
    self.dropout_prob = dropout_prob
  def __call__(self, u):
    """Map input ``u`` of shape (rows, num_features) to one value per row."""
    # The body used to read an undefined name `state`; the input is `u`.
    layer_output = self.layer1(u)
    layer_output = tfa.layers.GroupNormalization(groups = 1)(layer_output) 
    layer_output = self.activation(layer_output)
    layer_output = tf.nn.dropout(layer_output, self.dropout_prob)

    layer_output = self.layer2(layer_output)

    #[rows, 1] --> [rows]
    return tf.squeeze(layer_output, axis=-1, name='factor_squeeze')

def predict():
  """Replay the sales in FLAGS.predict_file with the trained policy and log the outcome.

  Starting from the stock vector in FLAGS.stock_file, the most probable
  replenishment action is chosen for every product and time period. For each
  period six lines are written to FLAGS.output_file: stock, action, overstock,
  sales, stockout and capacity. Sales are divided by shelf capacity, so the
  capacity line is written as capacity/capacity.
  """
  sales_dataset = tf.data.TFRecordDataset(FLAGS.predict_file)
  capacity_dataset = tf.data.TFRecordDataset(FLAGS.capacity_file)
  stock_dataset = tf.data.TFRecordDataset(FLAGS.stock_file)

  capacity = next(iter(capacity_dataset.map(capacity_parser)))['capacity']

  parsed_dataset = sales_dataset.map(sales_parser)

  # Used as read from the file, without dividing by capacity.
  x = next(iter(stock_dataset.map(stock_parser)))['stock']

  # Dropout is a training-time regulariser: with a non-zero rate the policy
  # output would change randomly from call to call, so it is disabled here.
  # The rate is not a checkpointed variable, so restoring is unaffected.
  actor = Actor(FLAGS.num_features, FLAGS.num_actions, FLAGS.hidden_size, activation=tf.nn.relu, dropout_prob=0.0)

  latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
  if latest_checkpoint is None:
    tf.print("warning: no checkpoint found in", FLAGS.output_dir, "- predicting with untrained weights", output_stream=sys.stderr)

  checkpoint = tf.train.Checkpoint(actor=actor)
  checkpoint.restore(latest_checkpoint).expect_partial()

  # Replenishment amount attached to every action index; identical for all
  # products and periods, so it is built once outside the loop.
  action_space = tf.tile([[0, 0.005, 0.01, 0.0125, 0.015, 0.0175, 0.02, 0.03, 0.04, 0.08, 0.12, 0.2, 0.5, 1]], [FLAGS.num_products, 1])

  def format_row(label, values):
    # One output line: label followed by the comma-separated values.
    return label + ','.join(map(str, values.numpy())) + "\n"

  with tf.io.gfile.GFile(FLAGS.output_file, "w") as writer:
    for sales_record in parsed_dataset:

      sales = tf.divide(sales_record['sales'], capacity)

      q = waste(x)

      #(p), (p), (p) --> (f, p) --> (p, f)
      s = tf.transpose(tf.stack([x, sales, q], axis=0), perm=[1, 0])

      # Greedy policy: take the most probable action of every product.
      policy_probs = actor(s)
      policy_mask = tf.one_hot(tf.math.argmax(policy_probs, axis=-1), FLAGS.num_actions)
      u = tf.boolean_mask(action_space, policy_mask)

      # Replenishment that does not fit on the shelf.
      overstock = tf.math.maximum(0, (x + u) - 1)

      x_u = tf.math.minimum(1, x + u)

      # Zero when demand is covered, negative by the unmet amount otherwise.
      stockout = tf.math.minimum(0, x_u - sales)

      writer.write(format_row("stock:", x))
      writer.write(format_row("action:", u))
      writer.write(format_row("overstock:", overstock))
      writer.write(format_row("sales:", sales))
      writer.write(format_row("stockout:", stockout))
      writer.write(format_row("capacity:", capacity/capacity))

      x = tf.math.maximum(0, x_u - sales)

def train():
  """Train the actor (policy) and critic (value) networks on recorded sales.

  All settings come from the global FLAGS. The sales records are cut into
  windows of FLAGS.batch_size periods. Within a window the policy is rolled
  out period by period, the transitions are collected, and one gradient step
  is applied to the actor and to the critic. The actor loss follows
  FLAGS.algorithm ('A2C', 'A2C_mod' or 'PPO'). A checkpoint is saved to
  FLAGS.output_dir every 10 episodes, and training resumes from the latest
  checkpoint found there.
  """
  #   LEGEND:
  #   p - number of products
  #   f - number of features
  #   t - number of timesteps in an episode
  #   n - number of actions
  #   ep - experience collection episodes

  # Sales for period [t, t+1], so index t=0 holds sales from 0 until 1.
  # Consecutive windows overlap by one record (shift = batch_size - 1): the
  # last record of a window is the first record of the next one.
  sales_dataset = tf.data.TFRecordDataset(FLAGS.train_file).window(FLAGS.batch_size, shift=FLAGS.batch_size-1, drop_remainder=False)

  capacity_dataset = tf.data.TFRecordDataset(FLAGS.capacity_file)
  parsed_capacity_dataset = capacity_dataset.map(capacity_parser)
  capacity = next(iter(parsed_capacity_dataset))['capacity']

  actor_optimizer = tf.optimizers.Adam(FLAGS.actor_learning_rate)
  critic_optimizer = tf.optimizers.Adam(FLAGS.critic_learning_rate)

  #Policy and Value networks with random weights
  actor = Actor(FLAGS.num_features, FLAGS.num_actions, FLAGS.hidden_size, activation=tf.nn.relu, dropout_prob=FLAGS.dropout_prob)
  critic = Critic(FLAGS.num_features, FLAGS.hidden_size, activation=tf.nn.relu, dropout_prob=FLAGS.dropout_prob)

  #Counter of applied updates
  global_step = tf.Variable(0)

  checkpoint_prefix = os.path.join(FLAGS.output_dir, "ckpt")
  checkpoint = tf.train.Checkpoint(critic_optimizer=critic_optimizer, actor_optimizer=actor_optimizer, critic=critic, actor=actor, step=global_step)
  checkpoint.restore(tf.train.latest_checkpoint(FLAGS.output_dir))

  # Replenishment amount attached to every action index; constant, so it is
  # built once instead of once per timestep.
  action_space = tf.tile([[0, 0.005, 0.01, 0.0125, 0.015, 0.0175, 0.02, 0.03, 0.04, 0.08, 0.12, 0.2, 0.5, 1]], [FLAGS.num_products, 1])

  def new_buffer(name, dtype=tf.float32, feature_dims=()):
    # Per-window experience buffer with one slot of shape (p, *feature_dims) per timestep.
    return tf.TensorArray(size=FLAGS.batch_size, dtype=dtype, element_shape=tf.TensorShape([FLAGS.num_products, *feature_dims]), name=name)

  # Probabilities of the selected actions from the previous update (PPO).
  # None until the first update; the old `global_step == 0` test left it
  # undefined when training resumed from a checkpoint.
  pu_old = None

  for episode in range(FLAGS.train_episodes):
    #random initial inventory
    # 0 <= x <= 1: eq 2
    x = tf.random.uniform(shape=[FLAGS.num_products], minval=0, maxval=1, dtype=tf.dtypes.float32)
    # q-hat: estimate of waste, the fraction FLAGS.waste of the inventory
    q = waste(x)

    for batch_dataset in sales_dataset:
      with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
        experience_step = 0
        experience_s = new_buffer("experience_s", feature_dims=(FLAGS.num_features,))
        experience_u = new_buffer("experience_u")
        experience_p = new_buffer("experience_p", feature_dims=(FLAGS.num_actions,))
        experience_i = new_buffer("experience_i", dtype=tf.int64)
        experience_pu = new_buffer("experience_pu")
        experience_overstock = new_buffer("experience_overstock")
        experience_s_prime = new_buffer("experience_s_prime", feature_dims=(FLAGS.num_features,))
        experience_r = new_buffer("experience_r_prime")
        experience_z = new_buffer("experience_z")
        experience_q = new_buffer("experience_q")
        experience_quan = new_buffer("experience_quan")

        # One iterator serves both the initial record and the loop below.
        # Iterating the mapped dataset a second time would start over and
        # consume the window's first record twice.
        records = iter(batch_dataset.map(sales_parser))

        sales = tf.divide(next(records)['sales'], capacity)

        #state is starting inventory, forecast sales during this period and waste estimate
        #(p), (p), (p) --> (f, p) --> (p, f)
        s = tf.transpose(tf.stack([x, sales, q], axis=0), perm=[1, 0])

        #(p, f) --> (p, n)
        policy_probs = actor(s)

        for item in records:
          sales_prime = tf.divide(item['sales'], capacity)

          # Sample one action per product from the policy.
          #(p, n) --> (p)
          policy_index = tf.squeeze(tf.random.categorical(tf.math.log(policy_probs), 1))

          #(p) --> (p, n)
          policy_mask = tf.one_hot(policy_index, FLAGS.num_actions)

          #(p, n), (p, n) --> (p)
          policy_selected = tf.boolean_mask(policy_probs, policy_mask)

          #(p, n), (p, n) --> (p)
          u = tf.boolean_mask(action_space, policy_mask)

          overstock = tf.math.maximum(0, (x + u) - 1)

          # 0 <= x + u <= 1: eq 4
          #(p) + (p) --> (p)
          x_u = tf.math.minimum(1, x + u)

          # 0 <= x <= 1: eq 7
          #(p) - (p) --> (p)
          x_prime = tf.math.maximum(0, x_u - sales)

          # q-hat: estimate of waste for the next state
          q_prime = waste(x_prime)

          #(p), (p), (p) --> (f, p) --> (p, f)
          s_prime = tf.transpose(tf.stack([x_prime, sales_prime, q_prime], axis=0), perm=[1, 0])

          # 1 where the product is (almost) out of stock
          z = tf.cast(x < FLAGS.zero_inventory, tf.float32)

          # Spread of the inventory across products (95th - 5th percentile),
          # repeated so that every product carries the same penalty.
          quan = tf.repeat(tf.cast(quantile(x, 0.95) - quantile(x, 0.05), tf.float32), FLAGS.num_products)

          #(p), (p), (p), (p) --> (p)
          r = tf.cast(1 - z - overstock - q - quan, tf.float32)

          experience_s = experience_s.write(experience_step, s)
          experience_u = experience_u.write(experience_step, u)
          experience_p = experience_p.write(experience_step, policy_probs)
          experience_i = experience_i.write(experience_step, policy_index)
          experience_pu = experience_pu.write(experience_step, policy_selected)
          experience_overstock = experience_overstock.write(experience_step, overstock)
          experience_s_prime = experience_s_prime.write(experience_step, s_prime)
          experience_r = experience_r.write(experience_step, r)
          experience_z = experience_z.write(experience_step, z)
          experience_q = experience_q.write(experience_step, q)
          experience_quan = experience_quan.write(experience_step, quan)

          #(p, f) --> (p, n)
          policy_probs = actor(s_prime)

          x = x_prime
          q = q_prime
          s = s_prime
          sales = sales_prime

          experience_step = experience_step + 1

        if experience_step == 0:
          # A trailing window with a single record holds no transition.
          # Averaging over an empty batch would give NaN losses, so skip it.
          continue

        #(t, p, f) --> (t*p, f)
        s_steps = experience_s.stack()[:experience_step, :, :]
        s_batch = tf.reshape(s_steps, [-1, FLAGS.num_features])
        x_batch = tf.reshape(s_steps[:, :, 0], [-1])
        sal_bat = tf.reshape(s_steps[:, :, 1], [-1])
        u_batch = tf.reshape(experience_u.stack()[:experience_step, :], [-1])
        p_batch = tf.reshape(experience_p.stack()[:experience_step, :], [-1, FLAGS.num_actions])
        i_batch = tf.reshape(experience_i.stack()[:experience_step, :], [-1])
        pu_batch = tf.reshape(experience_pu.stack()[:experience_step, :], [-1])
        overstock_batch = tf.reshape(experience_overstock.stack()[:experience_step, :], [-1])
        s_prime_batch = tf.reshape(experience_s_prime.stack()[:experience_step, :, :], [-1, FLAGS.num_features])
        r_batch = tf.reshape(experience_r.stack()[:experience_step, :], [-1])
        z_batch = tf.reshape(experience_z.stack()[:experience_step, :], [-1])
        q_batch = tf.reshape(experience_q.stack()[:experience_step, :], [-1])
        quan_batch = tf.reshape(experience_quan.stack()[:experience_step, :], [-1])

        tf.print("rewards:", global_step, experience_step, tf.reduce_mean(r_batch, keepdims=False), output_stream=sys.stderr, summarize=-1)
        tf.print("stockouts:", global_step, experience_step, tf.reduce_mean(z_batch, keepdims=False), output_stream=sys.stderr, summarize=-1)
        tf.print("waste:", global_step, experience_step, tf.reduce_mean(q_batch, keepdims=False), output_stream=sys.stderr, summarize=-1)
        tf.print("quantile:", global_step, experience_step, tf.reduce_mean(quan_batch, keepdims=False), output_stream=sys.stderr, summarize=-1)

        tf.print("x    :", global_step, experience_step, tf.reduce_mean(x_batch, keepdims=False), output_stream=sys.stderr, summarize=-1)
        tf.print("u    :", global_step, experience_step, tf.reduce_mean(u_batch, keepdims=False), output_stream=sys.stderr, summarize=-1)
        tf.print("p    :", global_step, experience_step, tf.reduce_mean(p_batch, keepdims=False), output_stream=sys.stderr, summarize=-1)
        tf.print("pu   :", global_step, experience_step, tf.reduce_mean(pu_batch, keepdims=False), output_stream=sys.stderr, summarize=-1)
        tf.print("o    :", global_step, experience_step, tf.reduce_mean(overstock_batch, keepdims=False), output_stream=sys.stderr, summarize=-1)
        tf.print("sales:", global_step, experience_step, tf.reduce_mean(sal_bat, keepdims=False), output_stream=sys.stderr, summarize=-1)

        #(t*p, f) --> (t*p)
        v = critic(s_batch)

        #(t*p, f) --> (t*p)
        v_prime = critic(s_prime_batch)

        # One-step TD target and TD error
        y = r_batch + FLAGS.gamma*v_prime

        #(t*p), (t*p) --> (t*p)
        delta = y - v
        tf.print("delta:", global_step, tf.reduce_mean(delta, keepdims=False), output_stream=sys.stderr, summarize=-1)

        #(t*p) --> (1)
        critic_loss = 0.5*tf.reduce_mean(tf.math.square(delta), keepdims=False)
        tf.print("critic loss:", global_step, critic_loss, output_stream=sys.stderr, summarize=-1)

        # (Re)initialise the reference probabilities on the first update and
        # whenever the previous window had a different number of transitions
        # (a short trailing window), which would break the PPO ratio below.
        if pu_old is None or pu_old.shape != pu_batch.shape:
          tf.print("p_old == p_batch:", output_stream=sys.stderr, summarize=-1)
          pu_old = pu_batch

        #(t*p, n), (t*p, n) --> (t*p) --> (1)
        entropy_p = cross_entropy(p_batch, p_batch)
        tf.print("entropy adjusted:", global_step, FLAGS.entropy_coefficient*entropy_p, output_stream=sys.stderr, summarize=-1)

        if FLAGS.algorithm == 'A2C':
          #(t*p), (t*p), (1) --> (1), (1) --> (1)
          actor_loss = -tf.reduce_mean(tf.math.log(tf.math.maximum(1e-15, pu_batch))*tf.stop_gradient(delta), keepdims=False) - FLAGS.entropy_coefficient*entropy_p
        elif FLAGS.algorithm == 'A2C_mod':
          #(t*p), ... --> (t*p,n)
          ix_batch = tf.tile(tf.reshape(i_batch, [-1, 1]), [1, FLAGS.num_actions])

          # Target distribution: shift the log-probabilities by the TD error,
          # damped by the distance of each action to the sampled one.
          #(t*p,n) --> (t*p,n)
          p_new = tf.nn.softmax(tf.math.log(tf.math.maximum(1e-15, p_batch)) + tf.reshape(delta, [-1, 1]) / tf.cast(tf.math.abs(ix_batch - tf.cast(tf.range(FLAGS.num_actions), tf.int64)) + 1, tf.float32))
          # This definition was commented out, so the branch failed with a
          # NameError on the next line.
          #(t*p,n), ... --> (t*p)
          per_timestep_actor_loss = tf.reduce_mean(tf.math.squared_difference(p_batch, p_new), axis=-1)
          #(t*p), ... --> (1)
          actor_loss = tf.reduce_mean(per_timestep_actor_loss, axis=-1)
        elif FLAGS.algorithm == 'PPO':
          # Probability ratio against the reference. The reference is treated
          # as a constant so the gradient does not vanish when it equals
          # pu_batch right after a (re)initialisation.
          ratio = pu_batch/tf.stop_gradient(pu_old)

          #(t*p,), (t*p) --> (1)
          actor_loss = -tf.reduce_mean(tf.math.minimum(ratio*delta,tf.clip_by_value(ratio,1-0.2,1+0.2)*delta), keepdims=False) - FLAGS.entropy_coefficient*entropy_p

        tf.print("actor loss:", global_step, actor_loss, output_stream=sys.stderr, summarize=-1)

        pu_old = pu_batch

        global_step.assign_add(1)

      actor_gradients = actor_tape.gradient(actor_loss, actor.variables)
      critic_gradients = critic_tape.gradient(critic_loss, critic.variables)

      actor_optimizer.apply_gradients(zip(actor_gradients, actor.variables))
      critic_optimizer.apply_gradients(zip(critic_gradients, critic.variables))

    if (episode + 1) % 10 == 0:
      checkpoint.save(file_prefix=checkpoint_prefix)

  tf.print ("episode:", episode, global_step, output_stream=sys.stderr, summarize=-1)

def main():
  """Run the routine selected by ``FLAGS.action``.

  Raises:
    NotImplementedError: for any action other than 'TRAIN' and 'PREDICT'.
      The command line also accepts 'EVALUATE', which used to be ignored
      without any message.
  """
  if FLAGS.action == 'TRAIN':
    train()
  elif FLAGS.action == 'PREDICT':
    predict()
  else:
    raise NotImplementedError("action '{}' is not implemented".format(FLAGS.action))

if __name__ == '__main__':

    # Command-line options of the training / prediction step, in the order in
    # which they are registered (and therefore listed by --help).
    argument_specs = [
        ('--output_dir', dict(type=str, default='checkpoints',
            help='Model directrory in google storage.')),
        ('--train_file', dict(type=str, default='data/train.tfrecords',
            help='Train file location in google storage.')),
        ('--capacity_file', dict(type=str, default='data/capacity.tfrecords',
            help='Shelf capacity file location in google storage.')),
        ('--stock_file', dict(type=str, default='data/stock.tfrecords',
            help='Stock values in prediction mode. It is random during the training.')),
        ('--predict_file', dict(type=str, default='data/test.tfrecords',
            help='Predict/Test file location in google storage.')),
        ('--output_file', dict(type=str, default='./output.csv',
            help='Prediction output.')),
        ('--dropout_prob', dict(type=float, default=0.1,
            help='This used for all dropouts.')),
        ('--train_episodes', dict(type=int, default=1000,
            help='How many times to run scenarious.')),
        ('--num_products', dict(type=int, default=100,
            help='How many productse. This is a subset of all products. They are some of grocery products.')),
        ('--num_timesteps', dict(type=int, default=1000,
            help='How many timesteps in an episode.')),
        ('--num_features', dict(type=int, default=3,
            help='How many features in Critic/Actor network.')),
        ('--num_actions', dict(type=int, default=14,
            help='How many actions for store replenishment.')),
        ('--hidden_size', dict(type=int, default=32,
            help='Actor and Critic layers hidden size.')),
        ('--entropy_coefficient', dict(type=float, default=0.001,
            help='Applied to entropy regularizing value for actor loss.')),
        ('--gamma', dict(type=float, default=0.99,
            help='Discount in future rewards.')),
        ('--algorithm', dict(default='A2C', choices=['A2C','A2C_mod','PPO'],
            help='Learning algorithm for critic and actor.')),
        ('--waste', dict(type=float, default=0.025,
            help='Waste of store stock for time period.')),
        ('--num_experience_episodes', dict(type=int, default=5,
            help='How many episodes to collect experience before starting training.')),
        ('--num_training_epochs', dict(type=int, default=40,
            help='How many epochs to train from experience buffer.')),
        ('--actor_learning_rate', dict(type=float, default=0.001,
            help='Optimizer learning rate for Actor.')),
        ('--critic_learning_rate', dict(type=float, default=0.001,
            help='Optimizer learning rate for Critic.')),
        ('--logging', dict(default='INFO', choices=['DEBUG','INFO','WARNING','ERROR','CRITICAL'],
            help='Enable excessive variables screen outputs.')),
        ('--zero_inventory', dict(type=float, default=1e-5,
            help='Consider as zero inventory if less than that.')),
        ('--batch_size', dict(type=int, default=32,
            help='Batch size.')),
        ('--action', dict(default='PREDICT', choices=['TRAIN','EVALUATE','PREDICT'],
            help='An action to execure.')),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)

    # parse_known_args ignores arguments this script does not define.
    FLAGS, unparsed = parser.parse_known_args()

    main()