# Test OneTimeLearning
According to the paper, when $B=\min_i b_i\ge \frac{6m \log(\frac{n}{\epsilon})}{\epsilon^3}$, the OneTimeLearning algorithm is $(1-6\epsilon)$-competitive.

In fact, requiring $1-6\epsilon > 0$ (i.e. $\epsilon < \frac{1}{6}$) means $\frac{6m \log(\frac{n}{\epsilon})}{\epsilon^3} > 1296 m\log(6n)\ge 2592 \log(6n)$, where the last step uses $m\ge 2$. We also need to make sure $n>B$; otherwise the optimal solution of the offline primal linear program simply sets every variable to one. Together this means we need $n\ge 31000$, which is already a large number. If we want to set $\epsilon=0.1$, then $n \ge 170000$ is required. Such a problem is too big to run on a personal laptop: recall that the matrix $a\in\mathbb{R}^{m\times n}$ has $m\times n$ entries.

Thus we turn to other parameter settings to check the performance of OneTimeLearning when $B=\min_i b_i\ge \frac{6m \log(\frac{n}{\epsilon})}{\epsilon^3}$ does not hold.

In [1]:
# import packages
from Source.env import Env
from Source.agent import OneTimeLearning
import numpy as np
from scipy.optimize import linprog

In [2]:
# Offline linear programming as benchmark
#
# Builds a random LP  max pi @ x  s.t.  a @ x <= b, 0 <= x <= 1  and solves it
# offline; the optimum is the reference value for the online algorithm.
m = 4  # number of constraints (rows of a)
n = 3000  # number of decision variables (columns of a)
epsilon = 0.1
random_seed = 0
# The theoretical requirement B >= 6 * m * np.log(n / epsilon) / epsilon**3
# is far larger than n for these values, so a smaller budget is used on purpose.
B = 1000
print(f"B is {B}")
b = B * np.ones(m)

# Seed the global NumPy generator so pi and a are reproducible.
np.random.seed(random_seed)
pi = np.random.uniform(low=0.0, high=1.0, size=n)
a = np.random.uniform(low=0.0, high=1.0, size=(m, n))
# linprog minimises, so the objective is negated to maximise pi @ x.
# A single (min, max) tuple applies the same bounds to every variable.
opt_res = linprog(c=-pi, A_ub=a, b_ub=b, bounds=(0.0, 1.0))
if not opt_res.success:
    # Do not report a benchmark value when the solver did not converge.
    raise RuntimeError(f"offline LP did not solve: {opt_res.message}")
print(f"offline optimal is {-opt_res.fun}")

B is 1000
offline optimal is 1364.2004331242583


In [4]:
# Run repeated online experiments and report the average reward
from tqdm import tqdm

n_experiments = 1000
# One slot per replication, holding the sum of that agent's reward_ entries.
reward_experiments_ = np.zeros(n_experiments)

for exp_id in tqdm(range(n_experiments)):
    # Each replication gets its own seed, offset from the base seed.
    env = Env(m=m, n=n, b=b, pi=pi, a=a, random_seed=random_seed + exp_id)
    agent = OneTimeLearning(m=m, n=n, epsilon=epsilon, b=b)
    # Hand the agent one (pi_t, a_t) pair at a time until env.if_stop() is true.
    while True:
        if env.if_stop():
            break
        pi_t, a_t = env.deal()
        agent.action(pi_t=pi_t, a_t=a_t)
    reward_experiments_[exp_id] = np.sum(agent.reward_)

print(f"mean online reward {reward_experiments_.mean()}")

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:34<00:00, 10.56it/s]

mean online reward 1163.1932210182342





# Test DynamicLearning

We use the same kind of setup as above (here with the smaller values $n=100$ and $B=40$) to check whether the DynamicLearning algorithm performs better.

In [1]:
# import packages
from Source.env import Env
from Source.agent import DynamicLearning
import numpy as np
from scipy.optimize import linprog

In [2]:
# Problem parameters for the offline linear program
random_seed = 0
epsilon = 0.1
m, n = 4, 100
# Theoretical budget bound, kept for reference only:
# B = 6 * m * np.log(n / epsilon) / epsilon**3
B = 40
print(f"B is {B}")
# Same budget B for each of the m constraints, stored as floats.
b = np.full(m, float(B))

# Seed the global NumPy generator, then draw pi (length n) before a (m x n).
np.random.seed(random_seed)
pi = np.random.uniform(0.0, 1.0, n)
a = np.random.uniform(0.0, 1.0, (m, n))

B is 40


In [4]:
# Solve the offline LP as the benchmark. linprog minimises, so the objective
# is negated; a single (min, max) tuple bounds every variable to [0, 1].
opt_res = linprog(c=-pi, A_ub=a, b_ub=b, bounds=(0.0, 1.0))
if not opt_res.success:
    # Do not report a benchmark value when the solver did not converge.
    raise RuntimeError(f"offline LP did not solve: {opt_res.message}")
print(f"offline optimal is {-opt_res.fun}")

offline optimal is 45.29927538403864


In [3]:
# Run repeated online experiments and report the average reward
from tqdm import tqdm

n_experiments = 1000
# One slot per replication, holding the sum of that agent's reward_ entries.
reward_experiments_ = np.zeros(n_experiments)

for exp_id in tqdm(range(n_experiments)):
    # Each replication gets its own seed, offset from the base seed.
    env = Env(m=m, n=n, b=b, pi=pi, a=a, random_seed=random_seed + exp_id)
    agent = DynamicLearning(m=m, n=n, epsilon=epsilon, b=b)
    # Hand the agent one (pi_t, a_t) pair at a time until env.if_stop() is true.
    while True:
        if env.if_stop():
            break
        pi_t, a_t = env.deal()
        agent.action(pi_t=pi_t, a_t=a_t)
    reward_experiments_[exp_id] = np.sum(agent.reward_)

print(f"mean online reward {reward_experiments_.mean()}")

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:28<00:00, 35.21it/s]

mean online reward 1.5089851290228125



