In [313]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
from scipy.stats import poisson, uniform, expon, pareto
from tqdm import tqdm
from mdptoolbox import mdp, util
import itertools
from scipy.sparse import csr_matrix, lil_matrix
import math

In [4]:
# TRANSITIONS (A,S,S)
# REWARDS     (A,S,S)

In [145]:
# construct possible histories
all_combinations = itertools.product(range(6), repeat=4)
histories = np.asarray([combination for combination in all_combinations if sum(combination) <= 5])
histories[-1], len(histories)

(array([5, 0, 0, 0]), 126)

In [146]:
# construct pendings
all_combinations = itertools.product(range(11), repeat=2)
pendings = np.asarray([combination for combination in all_combinations if sum(combination) <= 20])
pendings[-1], len(pendings)

(array([10, 10]), 121)

In [147]:
pendings[100]

array([9, 1])

^ interpret that as... 9 of 1, 1 of 10

so at most 10 pending at a given time. 2 diff values.

In [148]:
len(histories), len(pendings)

(126, 121)

In [149]:
len(pendings) * len(histories)

15246

In [157]:
def tostr(arr):
    return ','.join(str(x) for x in arr)

In [434]:
states = []
stateInds = {}
ind = 0
for i in tqdm(range(len(pendings))):
    for j in range(len(histories)):
        p, h = pendings[i], histories[j]
        full = np.concatenate((p,h))
        states.append(full)
        strrep = tostr(full)
        stateInds[strrep] = ind
        ind += 1
states = np.asarray(states)
len(stateInds), len(states)

100%|██████████| 121/121 [00:00<00:00, 746.25it/s]


(15246, 15246)

In [360]:
6*15246*15246 # num of elements, need to use sparsity

1394643096

In [374]:
def getReward(state):
    p1, p10 = state[0], state[1]
    return - (p1 + p10*10)

In [375]:
states[12151], getReward(states[12151]) # reward is negative of new state

(array([8, 8, 0, 5, 0, 0]), -88)

In [395]:
# all the values and probabilities for potential new pendings
numProbs = {0:0.5, 1:0.4, 5:0.1}
newPendings = [
    (0,0),
    (1,0),(0,1),
    (5,0),(4,1),(3,2),(2,3),(1,4),(0,5)]
newPendingProbs = [
    (0.9**ones)*(0.1**tens)*math.comb(ones+tens,ones)*numProbs[ones+tens]
    for (ones,tens) in newPendings]
newPendingProbs, sum(newPendingProbs)

([0.5,
  0.36000000000000004,
  0.04000000000000001,
  0.05904900000000001,
  0.032805,
  0.007290000000000002,
  0.0008100000000000004,
  4.500000000000001e-05,
  1.0000000000000004e-06],
 1.0)

In [372]:
### SKIP
curInd = 0
state = states[curInd]
for i, (ones, tens) in enumerate(newPendings):
    newState = np.copy(state)
    newState[0] += ones
    newState[1] += tens
    prob = newPendingProbs[i]
    try:
        newInd = stateInds[tostr(newState)]
    except KeyError: # not a valid new state, leave pendings
        newInd = curInd
    transitions[0][curInd, newInd] = prob
    rewards[0][curInd, newInd] = getReward(newState)
    print(i, state, newState, prob, newInd, getReward(newState))

0 [0 0 0 0 0 0] [0 0 0 0 0 0] 0.5 0 0
1 [0 0 0 0 0 0] [1 0 0 0 0 0] 0.36000000000000004 1386 -1
2 [0 0 0 0 0 0] [0 1 0 0 0 0] 0.04000000000000001 126 -10
3 [0 0 0 0 0 0] [5 0 0 0 0 0] 0.05904900000000001 6930 -5
4 [0 0 0 0 0 0] [4 1 0 0 0 0] 0.032805 5670 -14
5 [0 0 0 0 0 0] [3 2 0 0 0 0] 0.007290000000000002 4410 -23
6 [0 0 0 0 0 0] [2 3 0 0 0 0] 0.0008100000000000004 3150 -32
7 [0 0 0 0 0 0] [1 4 0 0 0 0] 4.500000000000001e-05 1890 -41
8 [0 0 0 0 0 0] [0 5 0 0 0 0] 1.0000000000000004e-06 630 -50


In [449]:
transitions = [lil_matrix((15246,15246), dtype=float) for _ in range(6)]
rewards = [lil_matrix((15246,15246), dtype=float) for _ in range(6)]
transitions[0].size, transitions[0].shape

(0, (15246, 15246))

In [450]:
# action = 0, do nothing, reward from newstate
for curInd, state in tqdm(enumerate(states)):
    for i, (ones, tens) in enumerate(newPendings):
        newState = np.copy(state)
        newState[0] += ones
        newState[1] += tens
        prob = newPendingProbs[i]
        try:
            newInd = stateInds[tostr(newState)]
        except KeyError: # not a valid new state, leave state alone
            newInd = curInd
        transitions[0][curInd, newInd] += prob
        rewards[0][curInd, newInd] = getReward(states[newInd])

15246it [00:03, 4072.54it/s]


In [451]:
# action = a, pop a single wdraw, reward from newstate
for a in range(1,6):
    for curInd, state in tqdm(enumerate(states)):
        interState = np.copy(state)
        reward = 0
        if sum(interState[2:]) + a > 5: # illegal action: violation of constraint
            transitions[a][curInd, curInd] += 1 # stay 
            rewards[a][curInd, curInd] = -10000 # big penalty

        elif sum(interState[:2]) < a:   # illegal action: not enough to process
            transitions[a][curInd, curInd] += 1 # stay
            rewards[a][curInd, curInd] = -10000 # big penalty
        else: # legal action: process
            p1, p10 = interState[0], interState[1]
            remTens = a - p10
            if remTens <= 0:  # only 10s consumed
                p10 -= a
            else:             # use the remainder to process 1s
                p10 = 0
                p1 -= remTens
            interState[0] = p1
            interState[1] = p10

            # shift history
            interState[3:6] = interState[2:5]
            interState[2] = a

            # new pendings
            for i, (ones, tens) in enumerate(newPendings):
                newState = np.copy(interState)
                # add new withdrawals
                newState[0] += ones
                newState[1] += tens
                prob = newPendingProbs[i]
                try:
                    newInd = stateInds[tostr(newState)]
                except KeyError: # not a valid new state, leave state alone
                    newInd = curInd
                transitions[a][curInd, newInd] += prob
                rewards[a][curInd, newInd] = getReward(states[newInd])

15246it [00:02, 6149.94it/s]
15246it [00:01, 11197.06it/s]
15246it [00:00, 21491.25it/s]
15246it [00:00, 34364.61it/s]
15246it [00:00, 47805.50it/s]


In [452]:
transitions = np.asarray([
    csr_matrix(transitions[0]),
    csr_matrix(transitions[1]),
    csr_matrix(transitions[2]),
    csr_matrix(transitions[3]),
    csr_matrix(transitions[4]),
    csr_matrix(transitions[5]),
])
rewards = np.asarray([
    csr_matrix(rewards[0]),
    csr_matrix(rewards[1]),
    csr_matrix(rewards[2]),
    csr_matrix(rewards[3]),
    csr_matrix(rewards[4]),
    csr_matrix(rewards[5]),
])
transitions[1].shape

(15246, 15246)

In [453]:
util.check(transitions, rewards)

In [454]:
vi = mdp.ValueIteration(transitions, rewards, discount=0.9)
vi.run()
vi.iter

110

In [460]:
vi.V[0], vi.policy[0]

(-33.49691736539085, 0)

In [472]:
np.asarray(vi.policy).mean()

0.6148497966679785

In [447]:
for i in range(15246):
    pr = transitions[1][i].sum()
    if pr < 0.99 or pr > 1.01:
        print(i)
        break

0


In [448]:
transitions[1][0].sum(), transitions[1][0].indices

(5.0, array([0], dtype=int32))

In [425]:
states[126], states[686]

(array([0, 1, 0, 0, 0, 0]), array([0, 5, 1, 0, 0, 0]))

In [433]:
curInd = 126
state = states[curInd]
interState = np.copy(state)
reward = 0
if sum(interState[2:]) + 1 > 5: # illegal action: violation of constraint
    print("bad")
elif sum(interState[:2]) < 1:   # illegal action: not enough to process
    print("bad")
else: # legal action: process
    print("good")
    p1, p10 = interState[0], interState[1]
    remTens = 1 - p10
    if remTens <= 0:  # only 10s consumed
        p10 -= 1
    else:             # use the remainder to process 1s
        p10 = 0
        p1 -= remTens
    interState[0] = p1
    interState[1] = p10
    print(state)

    interState[3:6] = interState[2:5]
    interState[2] = 1
    print(interState)
    for i, (ones, tens) in enumerate(newPendings):
        newState = np.copy(interState)
        # add new withdrawals
        newState[0] += ones
        newState[1] += tens
        prob = newPendingProbs[i]
        try:
            newInd = stateInds[tostr(newState)]
        except KeyError: # not a valid new state, leave state alone
            newInd = curInd
        print(newState, prob, curInd, newInd)
#     transitions[1][curInd, newInd] += prob
#     rewards[1][curInd, newInd] = getReward(states[newInd])

good
[0 1 0 0 0 0]
[0 0 1 0 0 0]
[0 0 1 0 0 0] 0.5 126 56
[1 0 1 0 0 0] 0.36000000000000004 126 1442
[0 1 1 0 0 0] 0.04000000000000001 126 182
[5 0 1 0 0 0] 0.05904900000000001 126 6986
[4 1 1 0 0 0] 0.032805 126 5726
[3 2 1 0 0 0] 0.007290000000000002 126 4466
[2 3 1 0 0 0] 0.0008100000000000004 126 3206
[1 4 1 0 0 0] 4.500000000000001e-05 126 1946
[0 5 1 0 0 0] 1.0000000000000004e-06 126 686


In [401]:
util.check(transitions, rewards)

StochasticError: 'PyMDPToolbox - The transition probability matrix is not stochastic.'

In [3]:
mdp.ValueIteration()

TypeError: __init__() missing 3 required positional arguments: 'transitions', 'reward', and 'discount'