In [207]:
import numpy as np
import random

# Constants
NUM_ITEMS = 10
NUM_ACTIONS = 11
NUM_ACTIONS_P = 6
DISCOUNT_FACTOR = 0.9
NUM_STATES = NUM_ITEMS + 1  # Add one state for being out of stock

# Feature function for state approximation
def feature_function(state):
    """Build the feature vector used to approximate the value of `state`.

    The representation is a single raw feature: the state value itself.

    Args:
        state: Inventory level to encode.

    Returns:
        A 1-D NumPy array of length 1 whose only entry is `state`.
    """
    features = np.array([state])
    return features

# Linear function approximation for the value function
def approximate_value(theta, state):
    """Evaluate the linear value estimate for `state`.

    Args:
        theta: Weight vector with one entry per feature.
        state: Inventory level to evaluate.

    Returns:
        The dot product of `theta` with the feature vector of `state`.
    """
    features = feature_function(state)
    return np.dot(theta, features)

# Adjust the reward function to minimize costs
def get_reward(state, action, next_state):
    """Sample the one-step cost of restocking `action` items at inventory `state`.

    The cost is the restocking cost (one unit per item ordered) plus a
    lost-sales penalty of 2 per unit of sampled demand that exceeds `state`.
    Demand is drawn uniformly from 0..5 on every call, so the result is random.

    Args:
        state: Inventory level before the day's sales.
        action: Number of items restocked.
        next_state: Accepted for interface compatibility; not used here.

    Returns:
        The sampled cost (lower is better).
    """
    restocking_cost = action
    # np.random.randint excludes its upper bound, so 6 is required for demand
    # to cover 0..5 (the previous bound of 5 could never produce a demand of 5).
    sold_items = np.random.randint(0, 6)
    lost_sales_cost = max(0, sold_items - state) * 2  # Penalty for unmet demand
    return restocking_cost + lost_sales_cost

# Adjust the transition function to reflect the state changes correctly
def get_next_state(state, action):
    """Sample the inventory level that follows `state` after restocking `action`.

    Demand is drawn uniformly from 0..5 (random.randint includes both ends).
    The result is clamped to the range 0..NUM_ITEMS so that it always stays
    inside the state space the policy is defined over.

    Args:
        state: Current inventory level.
        action: Number of items restocked.

    Returns:
        The sampled next inventory level, between 0 and NUM_ITEMS inclusive.
    """
    sold_items = random.randint(0, 5)
    next_state = max(0, state - sold_items + action)
    # Without this cap a large restock pushed inventory above NUM_ITEMS,
    # i.e. outside the NUM_STATES levels the rest of the cell assumes.
    return min(next_state, NUM_ITEMS)

# Function to calculate transition probabilities based on a uniform distribution
def calculate_transition_probabilities():
    """Return the uniform distribution over daily sales of 0 through 5 items.

    Returns:
        A list of six floats, each equal to 1/6; index `s` holds the
        probability of selling `s` items.
    """
    outcome_count = 6
    return [1 / outcome_count for _ in range(outcome_count)]

def approximate(theta, state, action, sales_probabilities, gamma):
    """Estimate the cost-to-go of taking `action` in `state`.

    The estimate is a sampled immediate cost plus the discounted, probability-
    weighted value of sampled successor states. Every term of the sum draws a
    fresh successor from get_next_state, so the result is a noisy estimate and
    differs between calls.

    Args:
        theta: Weight vector of the linear value approximation.
        state: Current inventory level.
        action: Number of items restocked.
        sales_probabilities: Weights applied to the sampled successor values.
        gamma: Discount factor.

    Returns:
        The sampled immediate cost plus `gamma` times the weighted value sum.
    """
    next_state = get_next_state(state, action)
    # get_reward requires (state, action, next_state); the previous
    # two-argument call raised TypeError.
    reward = get_reward(state, action, next_state)
    # Iterate over the probability vector itself: indexing it with
    # range(NUM_ACTIONS) ran past its six entries and raised IndexError.
    expected_cost = sum(
        probability * approximate_value(theta, get_next_state(state, action))
        for probability in sales_probabilities
    )
    return reward + gamma * expected_cost

# Update parameter vector theta
def update_theta(theta, state, action, sales_probabilities, gamma, learning_rate):
    """Apply one temporal-difference step to the value-function weights.

    The TD error is (sampled cost + gamma * weighted value of sampled
    successor states) minus the current estimate for `state`. The weights move
    along the feature vector of `state`, scaled by `learning_rate` and the error.

    Args:
        theta: Current weight vector; it is not modified.
        state: Inventory level whose estimate is being corrected.
        action: Number of items restocked in `state`.
        sales_probabilities: Weights applied to the sampled successor values.
        gamma: Discount factor.
        learning_rate: Step size of the update.

    Returns:
        A new array holding the updated weights.
    """
    next_state = get_next_state(state, action)
    reward = get_reward(state, action, next_state)
    # Each term draws its own successor sample, so this is a noisy estimate.
    expected_cost = sum(
        probability * approximate_value(theta, get_next_state(state, action))
        for probability in sales_probabilities
    )
    error = reward + gamma * expected_cost - approximate_value(theta, state)
    # Return a fresh array instead of using `+=`: the in-place form silently
    # changed the caller's array and fails outright on integer-typed weights.
    return theta + learning_rate * error * feature_function(state)

# Approximate Dynamic Programming
def approximate_dynamic_programming(num_days=10, learning_rate=0.01):
    """Run the approximate dynamic programming loop and print daily progress.

    For every day and every inventory level, the restock quantity with the
    lowest sampled cost estimate is stored in the policy, and the value
    weights are then updated with one TD step for that state/action pair.
    After each day one line is printed with the current policy and a sampled
    cost estimate for state 0.

    The cost estimates are built from random samples, so the printed policy
    and costs vary from run to run unless the random generators are seeded.

    Args:
        num_days: Number of passes over the state space (default 10).
        learning_rate: Step size passed to update_theta (default 0.01).
    """
    theta = np.random.rand(feature_function(0).shape[0])  # Random initial weights
    policy = [0] * NUM_STATES
    # The demand distribution never changes, so build it once up front.
    sales_probabilities = calculate_transition_probabilities()

    for day in range(num_days):
        for state in range(NUM_STATES):
            best_action = None
            min_expected_cost = float('inf')

            # Keep the action with the lowest sampled cost estimate.
            for action in range(NUM_ACTIONS):
                expected_cost = _sampled_action_cost(theta, state, action, sales_probabilities)
                if expected_cost < min_expected_cost:
                    min_expected_cost = expected_cost
                    best_action = action

            policy[state] = best_action

            # One TD step for the chosen action in this state.
            theta = update_theta(theta, state, best_action, sales_probabilities, DISCOUNT_FACTOR, learning_rate)

        # For illustration only: a sampled cost estimate for the empty-stock state.
        expected_cost_next_day = _sampled_action_cost(theta, 0, policy[0], sales_probabilities)
        print(f"Day {day} - Policy: {policy}, Approximate Expected Cost for Next Day: {expected_cost_next_day}")


def _sampled_action_cost(theta, state, action, sales_probabilities):
    """Return a sampled immediate cost plus the discounted weighted value of sampled successors."""
    # The successor is sampled here; previously `next_state` was read without
    # ever being assigned, which raises NameError on a fresh kernel.
    next_state = get_next_state(state, action)
    immediate_cost = get_reward(state, action, next_state)
    future_value = sum(
        probability * approximate_value(theta, get_next_state(state, action))
        for probability in sales_probabilities
    )
    return immediate_cost + DISCOUNT_FACTOR * future_value

# Run the approximate dynamic programming; this prints one line per day with
# the current policy and an approximate expected cost.
approximate_dynamic_programming()


Day 0 - Policy: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 5.036224003101526
Day 1 - Policy: [1, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 1.0056176528590794
Day 2 - Policy: [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 2.0
Day 3 - Policy: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 0.0
Day 4 - Policy: [1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 7.007965125420448
Day 5 - Policy: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 2.0
Day 6 - Policy: [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 4.0
Day 7 - Policy: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 4.0
Day 8 - Policy: [1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 5.0
Day 9 - Policy: [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], Approximate Expected Cost for Next Day: 4.0


In [130]:
# Q-learning setup

# Problem dimensions
NUM_DAYS = 10  # Horizon
MAX_ITEMS = 10  # Maximum number of items
NUM_ACTIONS = MAX_ITEMS + 1  # Actions: restock 0 to 10 items
STATE = MAX_ITEMS + 1  # Inventory levels 0 to 10

# Q-learning hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.7  # Exploration rate

# Q-table: one row per inventory level, one column per restock quantity
Q = np.zeros((STATE, NUM_ACTIONS))

# Function to get the reward for a given state, action, and next state
def get_reward(state, action, next_state):
    """Sample the cost of a transition that restocked `action` items.

    The cost is `action` (restocking) plus 2 for every unit by which a freshly
    sampled demand of 0..5 exceeds `next_state`. `state` is accepted but not
    used in the calculation.

    Returns:
        The sampled cost (lower is better).
    """
    demand = np.random.randint(0, 6)  # Upper bound is exclusive: demand is 0..5
    shortfall = demand - next_state
    stockout_penalty = shortfall * 2 if shortfall > 0 else 0
    return action + stockout_penalty

    
# Function to get the next state based on the current state and action
def get_next_state(state, action):
    """Sample the inventory level after one day of sales and a restock.

    Demand is drawn uniformly from 0..5 (both ends included). The resulting
    level is clamped to the range 0..MAX_ITEMS.

    Returns:
        The sampled next inventory level.
    """
    demand = random.randint(0, 5)
    unclamped_level = state - demand + action
    return min(max(0, unclamped_level), MAX_ITEMS)

# Function to choose an action using epsilon-greedy strategy
def choose_action(state):
    """Pick a restock quantity for `state` with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action is returned.
    Otherwise the action with the lowest Q-value is returned, because the
    Q-table accumulates costs and a lower value is therefore better.

    Args:
        state: Current inventory level (row index into Q).

    Returns:
        The chosen action as an integer in 0..NUM_ACTIONS - 1.
    """
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, NUM_ACTIONS - 1)  # Explore: random action
    # Exploit: Q holds costs, so the greedy choice is argmin (argmax would
    # deliberately select the most expensive action).
    return int(np.argmin(Q[state, :]))


# Q-Learning algorithm
for day in range(NUM_DAYS):
    state = MAX_ITEMS - 1  # Start with a full warehouse - 1

    for _ in range(MAX_ITEMS):
        # Choose an action
        action = choose_action(state)

        # Perform the action and sample the new state
        next_state = get_next_state(state, action)

        # Keep next_state at or below MAX_ITEMS - 1
        next_state = min(next_state, MAX_ITEMS - 1)

        # Sample the cost of the transition
        reward = get_reward(state, action, next_state)

        # Update the Q-value for the current state-action pair. The reward is
        # a cost, so the bootstrap target uses the cheapest follow-up action
        # (min); using max would propagate worst-case costs instead.
        old_value = Q[state, action]
        next_min = np.min(Q[next_state, :])
        Q[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_min)

        # Move to the new state
        state = next_state

# Print the Q-table
print("Q-table after learning:\n", Q)

Q-table after learning:
 [[ 0.          1.28905     0.5160145   0.9         0.          0.
   0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.936       0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          1.78767     0.        ]
 [ 0.4         0.          0.30684047  0.57259049  0.          0.
   0.70684047  0.          0.806561    0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.67259049  0.          0.          0.
   0.          0.          0.          0.          3.75489344]
 [ 0.13820049  0.          0.35874347  0.          0.          0.
   0.          0.          0.          0.          0.        ]
 [ 0.          0.118       0.          0.          0.          1.159369
