In [1]:
import numpy as np
import pandas as pd

In [2]:
import sys
sys.path
sys.path.append('./scripts/')

In [3]:
import POMDPLearn as pom

# 2DMaze Dataset of 1000 agents

- In this maze the aim of the agent is to experience maximum reward
- The dataset in our posetion was generated by 1000 agents with a fixed reward over states with some random noise over all states for each agent, thus describing how every agent has a different objective. The executed policy of each agent over a horizon of 5 epochs was used to generate the dataset shown below.
- The model learned from all agents is an attempt to identify a policy that satisfies the majority of agents.

| 0  | 1  | 2  | 3  | Reward  |
|----|----|----|----|---------|
| 4  | 5  | 6  | 7  | Penalty |
| 8  | 9  | 10 | 11 | 12      |
| 13 | 14 | 15 | 16 | 17      |

In [4]:
df = pd.read_csv('./Datasets/2DMaze.csv')
df.head(3)

Unnamed: 0,agent_id,horizon_step_0,horizon_step_1,horizon_step_2,horizon_step_3,horizon_step_4
0,0,"(9, 1)","(13, 0)","(13, 0)","(13, 0)","(13, 0)"
1,1,"(16, 1)","(10, 0)","(10, 0)","(10, 0)","(10, 0)"
2,2,"(4, 1)","(1, 1)","(6, 0)","(6, 0)","(6, 0)"


# Steps:

## Data preprocessing

- The state of the dataset to be used by the POMDPLearn library must satisfy the following criteria
    - States,action, and observations must be separate colummns with the keyword "state_", "action_", "obs_" followed by the number of the epoch in the horizon.

In [5]:
def getStateAction(x):
    """Method that parses actions and state from string '(state,action)'"""
    return [int(i) for i in x.replace('(','').replace(')','').split(', ')]

In [6]:
from tqdm import tqdm

## Adding state and action columns over time to dataframe

In [7]:
hor_cols = [i for i in df.columns if i != 'agent_id']

count = 0
for col in tqdm(hor_cols):
    new_cols = ['state' + str(count),'action' + str(count)]
    df[new_cols] = df[col].apply(lambda x: pd.Series(data=getStateAction(x),index=new_cols)) 
    count += 1

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:01<00:00,  2.51it/s]


In [8]:
df.head(5)

Unnamed: 0,agent_id,horizon_step_0,horizon_step_1,horizon_step_2,horizon_step_3,horizon_step_4,state0,action0,state1,action1,state2,action2,state3,action3,state4,action4
0,0,"(9, 1)","(13, 0)","(13, 0)","(13, 0)","(13, 0)",9,1,13,0,13,0,13,0,13,0
1,1,"(16, 1)","(10, 0)","(10, 0)","(10, 0)","(10, 0)",16,1,10,0,10,0,10,0,10,0
2,2,"(4, 1)","(1, 1)","(6, 0)","(6, 0)","(6, 0)",4,1,1,1,6,0,6,0,6,0
3,3,"(10, 1)","(7, 1)","(18, 0)","(18, 0)","(18, 0)",10,1,7,1,18,0,18,0,18,0
4,4,"(12, 1)","(19, 1)","(18, 0)","(18, 0)","(18, 0)",12,1,19,1,18,0,18,0,18,0


### Keeping state and action columns

In [9]:
cols = []
for i in range(5):
    cols.append('state' + str(i))
    cols.append('action' + str(i))

df_MDP = df[cols]
df_MDP.head(3)

Unnamed: 0,state0,action0,state1,action1,state2,action2,state3,action3,state4,action4
0,9,1,13,0,13,0,13,0,13,0
1,16,1,10,0,10,0,10,0,10,0
2,4,1,1,1,6,0,6,0,6,0


    
## Dataset and Model definition

### Dataset

- The preprocessed dataset will be input to the MDP or POMDP dataset object
- Using this dataset object we define an MDP or POMDP object

### MDP model definition

- By calling the MDP class and passing the pandas dataframe of our MDP dataset we automatically instantiate an MDP dataset

```
mdpModel = MDP(df=dfMDP_dataset)
```

In [10]:
mdpDataset = pom.MDPDataset(df=df_MDP)

1000it [00:00, 170486.30it/s]
1000it [00:00, 106508.48it/s]


In [11]:
#action 0: change state
#action 1: stay in state
mdpDataset.unique_actions

array([0, 1])

In [12]:
mdpDataset.unique_states

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [13]:
mdpAgents = pom.MDP(states=mdpDataset.unique_states,actions=mdpDataset.unique_actions,
                   horizon=mdpDataset.horizon)

## Training and solving the MDP or POMDP model

- Using the trainMDP method of the MDP class we train the MDP model

- Using the MDPsolve() method we solve the MDP model using value iteration

In [14]:
mdpAgents.trainMDP(MDPDataset=mdpDataset)

Learning the transition matrix ...
EM iteration 1, loglik = -11101.0710
EM iteration 2, loglik = -2675.5772
EM iteration 3, loglik = -2154.1505
EM iteration 4, loglik = -2014.1984
EM iteration 5, loglik = -1953.6512
EM iteration 1, loglik = -5303.4051
EM iteration 2, loglik = -3448.4711
EM iteration 3, loglik = -3142.1215
EM iteration 4, loglik = -2995.2675
EM iteration 5, loglik = -2909.4094
Learning the rewards ...


In [15]:
np.argsort(mdpAgents.T[1],axis=1)[:,:15:-1][:5] #most probable transition is to remain in state

array([[ 1,  5,  4, 18],
       [ 2,  6,  0, 18],
       [ 3, 19, 17,  4],
       [10, 16,  8,  7],
       [ 1,  9, 15,  5]])

### Solving

In [16]:
mdpAgents.MDPSolve()

In [17]:
mdpAgents.U

array([3.71567946, 4.73049575, 6.68624595, 7.01527192, 3.50242081,
       4.58784972, 5.26657209, 7.90858839, 3.56262332, 4.01421098,
       5.9828596 , 5.8836767 , 6.38740958, 3.03284842, 3.82930125,
       4.10985025, 4.43091897, 4.9238    , 9.98934481, 7.63210325])

## Policy execution

- The policy obtained using the MDPsolve() method can be executed for any initial state using the policyExecution() method


###### Note

In the new transition matrices learned by the agents the probabilities of transitioning are not zero, this is why we observe transitions after the action of stay.

In [18]:
mdpAgents.policy

array([1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       0., 0., 0.])

In [19]:
# returns state action pair of the policy over a horizon of 5 epochs 
mdpAgents.policyExecution(np.random.randint(0,20,4))

array([[[ 8,  1],
        [ 5,  1],
        [ 2,  0],
        [18,  0],
        [18,  0]],

       [[ 3,  0],
        [18,  0],
        [18,  0],
        [18,  0],
        [18,  0]],

       [[ 6,  1],
        [ 3,  0],
        [18,  0],
        [18,  0],
        [18,  0]],

       [[ 8,  1],
        [ 5,  1],
        [ 2,  0],
        [18,  0],
        [18,  0]]])