In [1]:
import numpy as np
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from numba import njit
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter
import random

In [2]:
def calculate_epsilon(T):
    """Build an exponentially decaying exploration schedule of length T.

    The schedule is ``epsilon_i = (1 - theta) ** i`` for ``i = 0, ..., T - 1``
    with ``theta = 1 - (1/1000000) ** (1/T)``.  It starts at 1 and would reach
    1e-6 at ``i = T``; the last value actually returned (``i = T - 1``) is
    therefore slightly above 1e-6.

    args:
        T: number of periods (int); a non-positive T yields an empty schedule
    returns:
        list with T floats, one exploration probability per period
    """
    periods = range(T)
    if not periods:
        # Nothing to schedule; returning here also avoids 1/T with T == 0.
        return []

    # theta depends only on T, so it is computed once rather than per period.
    theta = -((1/1000000) ** (1/T)) + 1
    decay = 1 - theta
    return [decay ** i for i in periods]

In [3]:
def demand(p1t,p2t):
    """Demand faced by agent 1 when the agents post prices p1t and p2t.

    args:
        p1t: price of agent 1
        p2t: price of agent 2
    returns:
        1 - p1t when agent 1 is strictly cheaper, half of that when the
        prices are equal, and 0 otherwise
    """
    # Strictly undercutting the rival captures the whole market.
    if p1t < p2t:
        return 1 - p1t
    # Equal prices: the market is split evenly.
    if p1t == p2t:
        return 0.5 * (1 - p1t)
    # In every remaining case agent 1 sells nothing.
    return 0

In [4]:
def profit(p1t, p2t, price_table):
    """Per-period profit of agent 1: its price times the demand it faces.

    args:
        p1t: index price of agent 1
        p2t: index price of agent 2
        price_table: indexable collection mapping a price index to a price
    returns:
        profit for agent
    """
    # Translate both indices into actual prices before evaluating demand.
    own_price = price_table[p1t]
    rival_price = price_table[p2t]
    return own_price * demand(own_price, rival_price)
 

In [5]:
def select_price_greedy(Q, current_s, p, epsilon):
    """Epsilon-greedy action selection.

    args:
        Q: Q-function, read as Q[state index, action index]
        current_s: current state, given as a price value that occurs in p
        p: price vector containing the possible prices (ndarray or list)
        epsilon: probability of selecting an action uniformly at random
    returns:
        the index of selected action
    raises:
        IndexError: if current_s does not occur in p
    """
    # Explore: with probability epsilon return a uniformly drawn price index.
    # NOTE: this draw comes from the stdlib `random` module while the choice
    # below comes from `np.random`, so both generators need seeding for a
    # reproducible run.
    if random.uniform(0, 1) < epsilon:
        return np.random.choice(len(p))

    # Exploit: find the row belonging to the current state. Converting p to an
    # array keeps the comparison element-wise even when p is a plain list.
    matches = np.where(np.asarray(p) == current_s)[0]
    if matches.size == 0:
        raise IndexError(f"current_s={current_s!r} is not one of the prices in p")
    return np.argmax(Q[matches[0], :])

In [None]:
def Q_func(price_idx, current_state_idx, q_table, price_table, delta, epsilon, alpha):
    """Apply one Q-learning update to the entry for (current state, chosen price).

    The new estimate is the profit of the chosen price in the current state,
    plus delta times its profit in the next state, plus delta**2 times the
    best Q-value available in the next state.

    args
        price_idx: current price index
        current_state_idx: current state index
        q_table: q_table for 1 player, indexed as q_table[state index, price index]
            (the layout select_price_greedy reads); it is modified in place
        price_table: vector of possible prices
        delta: discount applied to the next-period profit (squared for the
            continuation value)
        epsilon: exploration probability handed to select_price_greedy
        alpha: weight put on the new estimate when blending with the old one
    returns:
        updated Q-table
    """
    prev_est = q_table[current_state_idx, price_idx]
    # NOTE(review): the next state is drawn from this player's own table in the
    # current state - confirm whether the rival's policy should be used here.
    next_state_idx = select_price_greedy(q_table, price_table[current_state_idx], price_table, epsilon)
    new_est = (
        profit(price_idx, current_state_idx, price_table)
        + delta * profit(price_idx, next_state_idx, price_table)
        # The continuation term needs the maximal Q-value, not its position.
        + delta**2 * np.max(q_table[next_state_idx, :])
    )
    q_table[current_state_idx, price_idx] = (1 - alpha) * prev_est + alpha * new_est
    return q_table
    

In [None]:
# NOTE(review): Q_func declares seven required positional parameters
# (price_idx, current_state_idx, q_table, price_table, delta, epsilon, alpha),
# so this bare call raises TypeError as written - supply the arguments.
Q_func()