In [1]:
from stormvogel import *
from stormvogel.extensions import *

In [2]:
import gymnasium as gym
from collections import defaultdict
from stormvogel import pgc

def sample_gym(env: gym.Env, no_samples: int=10, sample_length:int = 1000, state_limit:int = 5_000, gymnasium_scheduler=None):
    """Sample the gym environment and convert it to a Stormvogel MDP. 
    Probablities are frequentist estimates. Their accuracy depends on how often each state is visited.
    When visiting a state multiple times, we always try to increase accuracy uniformly.
    States with the same observations cannot be distinguished, hence they become the same state in the resulting MDP, 
    even though they might be different in the gymnasium environment.

    Args:
        env (gym.Env): Gymnasium env.
        no_samples (int): Total number of samples (starting at an initial state).
            To resolve multiple initial states, a new, single initial state is always added.
        sample_length (int): The maximum length of a single sample.
        state_limit (int): Maximum number of states to explore. 
            If this is reached, the sampling will simply stop (which leads to it returning a partial MDP)
    """
    initial_states = set()
    visited_states = set()
    transition_counts = defaultdict(lambda: defaultdict(lambda: 0))
    transition_samples = defaultdict(lambda: 0)
    reward_sums = defaultdict(lambda: 0)
    labels = defaultdict(lambda: [])

    for s_no in range(no_samples):
        prev_state = None
        obs, _ = env.reset()
        state = (obs, False)
        initial_states.add(state)
        for _ in range(sample_length):
            action = (
                env.action_space.sample()
                if gymnasium_scheduler is None
                else gymnasium_scheduler(state)
            )
            prev_state = state
            obs, reward, terminated, truncated, info = env.step(action)
            state = (obs, terminated)
            visited_states.add(state)
            transition_counts[(prev_state, action)][state] += 1
            transition_samples[(prev_state, action)] += 1
            reward_sums[(prev_state, action)] += reward
            if terminated:
                break
    # print(initial_states)
    # print(visited_states)
    # print(transition_counts)
    # print(reward_sums)

    ALL_ACTIONS = [pgc.Action([str(x)]) for x in range(env.action_space.n)]
    INV_MAP = {a.labels[0]: no for no, a in enumerate(ALL_ACTIONS)}

    if len(initial_states) == 1:
        (init,) = initial_states
    else:
        init = None
    

    def available_actions(s):
        if s is None or s[1]:
            return [pgc.PgcEmpytAction]
        return ALL_ACTIONS
        
    def delta(s, a):
        if s is None:
            return [(1 / len(initial_states), s_) for s_ in initial_states]
        elif s[1]:
            return []
        return [(count / transition_samples[(s, INV_MAP[a.labels[0]])], s_) for s_, count in transition_counts[(s,INV_MAP[a.labels[0]])].items()]

    def rewards(s, a):
        return {"R":0}

    def labels(s):
        return [s]
        # if s is None:
        #     return []
        # done = ["done"] if s[1] else []
        # return [str(x) for x in s[0]] + done
        

    return pgc.build_pgc(
        delta=delta,
        initial_state_pgc=init,
        available_actions=available_actions,
        labels=labels,
        rewards=rewards,
        modeltype=stormvogel.model.ModelType.MDP,
    )
            
    
    

In [3]:
env = gym.make("Blackjack-v1", render_mode="rgb_array")
model = sample_gym(env, no_samples=10)
print(model)
#show(model)

Action with labels frozenset()
State 0 with labels ['init'] and valuations {}
[Action(labels=frozenset())]
Action with labels frozenset({'0'})
State 1 with labels [] and valuations {}
[Action(labels=frozenset({'1'}))]


RuntimeError: This action is not available in this state