In [1]:
import numpy as np
import gym

from model import LinearModel

In [2]:
# Instantiate the CartPole task and report its basic characteristics.
env = gym.make('CartPole-v1')

env_spec = env.spec
obs_space = env.observation_space

# Episode limits / solve criterion as declared by the environment spec.
print(f'Max Steps in an episode: {env_spec.max_episode_steps}')
print(f'Reward threshold: {env_spec.reward_threshold}')
# Dimensions of the observation and action spaces.
print(f'State Space: {obs_space.shape}')
print(f'Action Space: {env.action_space}')
# One randomly sampled observation, printed on its own line after the label.
print('State Example:', obs_space.sample(), sep='\n')

Max Steps in an episode: 500
Reward threshold: 475.0
State Space: (4,)
Action Space: Discrete(2)
State Example:
[ 3.5333614e+00 -1.3256218e+37 -3.9164603e-01  1.0614501e+38]


In [8]:
import torch
from collections import OrderedDict


class Policy:
    """Linear policy that turns an environment state into a discrete action.

    The policy keeps a NumPy weight matrix ``w`` of shape
    ``(n_state, n_action)`` and pushes it into a ``LinearModel`` under the
    state-dict key ``'layer1.weight'``. Actions are chosen either greedily
    (arg-max of the model output) or by sampling from the model output.
    """

    def __init__(self, n_state, n_action, stochastic_policy=False, random_seed=110):
        """Create the model and initialise it with small random weights.

        Args:
            n_state: Number of state features (model inputs).
            n_action: Number of discrete actions (model outputs).
            stochastic_policy: If True, ``select_action`` samples an action
                using the model output as probabilities; otherwise it takes
                the arg-max.
            random_seed: Seed passed to ``np.random.seed``.
        """
        self.n_state = n_state
        self.n_action = n_action
        self.stochastic_policy = stochastic_policy
        # Initialize weight matrix.
        # Note: this seeds NumPy's *global* RNG, so it also affects any later
        # np.random call in the process (including stochastic action sampling).
        np.random.seed(random_seed)
        self.model = LinearModel(n_inputs=n_state, n_output=n_action)
        # Uniform values in [0, 1e-3).
        self.w = np.random.rand(n_state, n_action) * 1e-3
        self.load_weights(w=self.w)


    def load_weights(self, w):
        """Copy the weight matrix ``w`` into the model as float32.

        Args:
            w: Array-like weight matrix to load as ``'layer1.weight'``.
        """
        # NOTE(review): torch.nn.Linear stores its weight as
        # (out_features, in_features); confirm that LinearModel.layer1 really
        # expects the (n_state, n_action) layout used here.
        # NOTE(review): only 'layer1.weight' is supplied -- presumably the
        # model has no other parameters (e.g. no bias); verify against model.py.
        params = OrderedDict()
        params['layer1.weight'] = torch.tensor(w, dtype=torch.float32)
        self.model.load_state_dict(params)


    def select_action(self, state, w=None):
        """Load weights into the model and pick an action for ``state``.

        Args:
            state: Environment observation, passed unchanged to
                ``self.model.forward``.
            w: Weight matrix to evaluate. Defaults to the instance's own
                ``self.w`` when omitted.

        Returns:
            The selected action index: sampled with ``np.random.choice`` when
            ``stochastic_policy`` is set, otherwise ``np.argmax`` of the
            model output.
        """
        if w is None:
            w = self.w
        self.load_weights(w)
        # Only the action index leaves this method, so no autograd graph is needed.
        with torch.no_grad():
            probs = self.model.forward(state)
        # NumPy cannot implicitly convert a tensor that is attached to the
        # autograd graph (or lives on another device); hand it a plain array.
        if isinstance(probs, torch.Tensor):
            probs = probs.detach().cpu().numpy()
        if self.stochastic_policy:
            # Requires the model output to be a valid 1-D probability vector.
            action = np.random.choice(range(self.n_action), p=probs)
        else:
            action = np.argmax(probs)
        return action