In [1]:
import random

In [5]:
# Environment: models the world the agent interacts with and keeps track of its internal state.
class Environment:
    """Toy environment that pays out a random reward for a limited number of steps.

    The only internal state is ``steps_left``: the number of interactions the
    agent may still perform before the episode ends.
    """

    def __init__(self, steps=10):
        """Start a new episode.

        Args:
            steps: How many times ``action`` may be called before the episode
                is over. Defaults to 10.
        """
        self.steps_left = steps

    def get_observation(self):
        """Return the current observation for the agent.

        The observation is always the same zero vector; nothing about the
        internal state is exposed through it.
        """
        return [0.0, 0.0, 0.0]

    def get_action(self):
        """Return the list of actions the agent may execute, encoded as 0 and 1."""
        return [0, 1]

    def is_done(self):
        """Return True once no steps are left, signalling the end of the episode."""
        # `<=` instead of `==` so a zero or negative step budget counts as
        # finished rather than producing an episode that never ends.
        return self.steps_left <= 0

    def action(self, action):
        """Handle one agent action and return the reward for it.

        Args:
            action: The action chosen by the agent. It is accepted but does
                not influence the reward.

        Returns:
            A random float produced by ``random.random()``.

        Raises:
            RuntimeError: If the episode is already over.
        """
        if self.is_done():
            raise RuntimeError("Game is over")
        self.steps_left -= 1
        return random.random()

In [9]:
# Agent: the actor that observes the environment, picks actions and accumulates the rewards.
class Agent:
    """Actor that plays randomly chosen actions and tallies the reward it receives."""

    def __init__(self):
        # Running sum of every reward collected so far.
        self.total_reward = 0.0

    def step(self, env):
        """Perform a single interaction with ``env``.

        The agent asks the environment for an observation, asks for the
        available actions, submits one of them picked at random, and adds the
        reward it gets back to ``total_reward``.

        Args:
            env: Object providing ``get_observation()``, ``get_action()`` and
                ``action(action)``.
        """
        # The observation is fetched but does not influence the choice below.
        observation = env.get_observation()
        available = env.get_action()
        picked = random.choice(available)
        gained = env.action(picked)
        self.total_reward = self.total_reward + gained
        
    
        

In [16]:
if __name__ == "__main__":
    # One environment/agent pair is enough for a single episode.
    env, agent = Environment(), Agent()

    # Keep interacting until the environment reports the episode is over.
    while True:
        if env.is_done():
            break
        agent.step(env)

    # Report the reward accumulated over the whole episode.
    print("Total reward got: %.4f" % (agent.total_reward,))

Total reward got: 4.8411
