In [3]:
import gym

In [5]:
import numpy as np
import operator
from collections import defaultdict

class MonteCarloAgent():
    """First-visit Monte Carlo control agent with an epsilon-greedy policy.

    The agent records one episode at a time as ``(state, action, reward)``
    triples via :meth:`setExperience`, and :meth:`learn` turns that episode
    into updated action-value estimates by averaging the discounted return
    observed after the first visit of every ``(state, action)`` pair.

    Attributes are plain containers so they can be inspected from a notebook:

    * ``QValues``      -- ``{state: {action: value}}``
    * ``returnsSum``   -- ``{(state, action): sum of first-visit returns}``
    * ``returnsCount`` -- ``{(state, action): number of first-visit returns}``
    * ``policy``       -- ``{action: probability}`` for the *current* state,
      refreshed by :meth:`getPolicy`.
    """

    def __init__(self, possibleActions, discountFactor, epsilon, initVals=0.0):
        """Create an agent.

        Args:
            possibleActions: sequence of hashable action identifiers.
            discountFactor: factor applied per step when accumulating returns.
            epsilon: exploration probability of the epsilon-greedy policy.
            initVals: stored in ``self.G``; it is not used anywhere else in
                this class.
        """
        self.possibleActions = possibleActions
        self.discountFactor = discountFactor
        self.epsilon = epsilon
        # Placeholder until setState() is called. A list is unhashable, so
        # act()/getPolicy() must not be called before a real state is set.
        self.state = []
        self.episode = []
        self.G = initVals
        self.QValues = defaultdict(dict)
        self.policy = defaultdict(float)
        self.returnsSum = defaultdict(float)
        self.returnsCount = defaultdict(float)
        self.numPossActions = len(self.possibleActions)

    def getPolicy(self):
        """Refresh ``self.policy`` with action probabilities for ``self.state``.

        For a state that already has Q-values the policy is epsilon-greedy:
        every action gets ``epsilon / n`` and the highest-valued action gets an
        extra ``1 - epsilon`` (the first maximal entry wins ties). For a state
        with no Q-values yet the policy is uniform.
        """
        if self.state in self.QValues:
            for action in self.possibleActions:
                self.policy[action] = self.epsilon / self.numPossActions
            bestAction = max(self.QValues[self.state].items(), key=operator.itemgetter(1))[0]
            self.policy[bestAction] += (1.0 - self.epsilon)
        else:
            for action in self.possibleActions:
                self.policy[action] = 1 / self.numPossActions

    def toStateRepresentation(self, state):
        """Return the hashable key used for ``state``: its first two components.

        Any further components of ``state`` are dropped.
        NOTE(review): for 3-component Blackjack observations this discards the
        third field -- confirm that is intended.
        """
        return (state[0], state[1])

    def setExperience(self, state, action, reward):
        """Append one ``(state, action, reward)`` step to the current episode."""
        self.episode.append((state, action, reward))

    def setState(self, state):
        """Set the state that the next :meth:`act` call will decide for."""
        self.state = state

    def reset(self):
        """Clear the current state and the recorded episode (learned tables are kept)."""
        self.state = []
        self.episode = []

    def act(self):
        """Sample and return an action for ``self.state`` from the current policy."""
        self.getPolicy()
        probs = list(self.policy.values())
        actions = list(self.policy.keys())
        # Sample an index (not the action itself) so the returned value keeps
        # its original Python type.
        action = actions[np.random.choice(np.arange(len(probs)), p=probs)]
        return action

    def setEpsilon(self, epsilon):
        """Replace the exploration probability."""
        self.epsilon = epsilon

    def computeHyperparameters(self):
        """Return the epsilon to use next; currently the unchanged ``self.epsilon``."""
        return self.epsilon

    def learn(self):
        """Update the Q-table from the recorded episode (first-visit Monte Carlo).

        For every distinct ``(state, action)`` pair in ``self.episode`` the
        discounted return from its first occurrence to the end of the episode
        is added to the running totals, and ``QValues[state][action]`` is set
        to the mean of all returns collected for that pair so far. States seen
        for the first time get a zero entry for every possible action.

        Returns:
            The agent's ``QValues`` mapping itself (not a copy).
        """
        # Index of the first occurrence of each (state, action) pair, found in
        # a single pass; setdefault keeps the earliest index.
        firstVisit = {}
        for idx, step in enumerate(self.episode):
            firstVisit.setdefault((step[0], step[1]), idx)

        for sa_pair, firstIdx in firstVisit.items():
            state, action = sa_pair

            # Discounted sum of rewards from the first visit onwards.
            G = sum(
                step[2] * (self.discountFactor ** offset)
                for offset, step in enumerate(self.episode[firstIdx:])
            )
            self.returnsSum[sa_pair] += G
            self.returnsCount[sa_pair] += 1.0

            # Give a newly seen state a zero entry for every action. The
            # comprehension has its own scope, so it cannot clobber `action`
            # (the previous explicit loop did, sending the update below to the
            # wrong action).
            if state not in self.QValues:
                self.QValues[state] = {a: 0 for a in self.possibleActions}

            self.QValues[state][action] = self.returnsSum[sa_pair] / self.returnsCount[sa_pair]

        return self.QValues
    

In [72]:

if __name__ == '__main__':
    # Short, fully logged training run: 30 Blackjack episodes with every
    # decision and transition printed.

    env = gym.make('Blackjack-v1')
    space_size = env.action_space.n
    # The agent is driven with action labels given as strings ("0", "1", ...).
    possibleActions = [str(i) for i in range(space_size)]

    print("Possible actions: ",possibleActions)

    # Initialize a Monte-Carlo Agent
    agent = MonteCarloAgent(possibleActions, discountFactor=0.999, epsilon=0.9)

    # Run training Monte Carlo Method
    for episode in range(30):
        print("\n******************************* EPISODE ",episode,"***************************")
        agent.reset()
        observation = env.reset()   # Starting observation of the new episode
        print("State after reset: ",observation)

        done = False
        while not done:
            # Re-apply the (currently constant) epsilon before each decision.
            agent.setEpsilon(agent.computeHyperparameters())

            # Decide from the agent's own representation of the observation.
            currentState = agent.toStateRepresentation(observation)
            agent.setState(currentState)
            action = agent.act()
            print("Action: ",action)

            # The environment expects an integer action; the result is unpacked as four values.
            nextObservation, reward, done, status = env.step(int(action))
            print("Next state: ",nextObservation)
            print("Next reward: ",reward)
            print("Episode finished: ",done)

            # Record the step against the state the action was taken in.
            agent.setExperience(currentState, str(action), reward)
            observation = nextObservation

        # Episode finished: update the value estimates from it.
        QValues = agent.learn()

   


Possible actions:  ['0', '1']

******************************* EPISODE  0 ***************************
State after reset:  (18, 10, False)
Policy to take action : 
action  0 Choose prob  0.5
action  1 Choose prob  0.5
Action:  1
Next state:  (25, 10, False)
Next reward:  -1.0
Episode finished:  True

******************************* EPISODE  1 ***************************
State after reset:  (12, 7, False)
Policy to take action : 
action  0 Choose prob  0.5
action  1 Choose prob  0.5
Action:  0
Next state:  (12, 7, False)
Next reward:  -1.0
Episode finished:  True

******************************* EPISODE  2 ***************************
State after reset:  (17, 10, False)
Policy to take action : 
action  0 Choose prob  0.5
action  1 Choose prob  0.5
Action:  1
Next state:  (27, 10, False)
Next reward:  -1.0
Episode finished:  True

******************************* EPISODE  3 ***************************
State after reset:  (17, 2, True)
Policy to take action : 
action  0 Choose prob  0.5
acti

In [6]:

if __name__ == '__main__':
    # Long, silent training run (50000 episodes) followed by a dump of the
    # learned Q-table and the highest-valued action per state.

    env = gym.make('Blackjack-v1')
    space_size = env.action_space.n
    # The agent is driven with action labels given as strings ("0", "1", ...).
    possibleActions = [str(i) for i in range(space_size)]

    # Initialize a Monte-Carlo Agent
    agent = MonteCarloAgent(possibleActions, discountFactor=0.999, epsilon=0.99)

    j = 0                  # number of episodes started
    cumulativeReward = 0   # sum of every reward received during training

    # Run training Monte Carlo Method
    for episode in range(50000):
        j += 1
        agent.reset()
        observation = env.reset()   # Starting observation of the new episode

        done = False
        while not done:
            # Re-apply the (currently constant) epsilon before each decision.
            agent.setEpsilon(agent.computeHyperparameters())

            # Decide from the agent's own representation of the observation.
            currentState = agent.toStateRepresentation(observation)
            agent.setState(currentState)
            action = agent.act()

            # The environment expects an integer action; the result is unpacked as four values.
            nextObservation, reward, done, status = env.step(int(action))
            cumulativeReward += reward

            # Record the step against the state the action was taken in.
            agent.setExperience(currentState, str(action), reward)
            observation = nextObservation

        # Episode finished: update the value estimates from it.
        QValues = agent.learn()

    print("Q value function: ")
    # Numbered listing of every state's action values, in sorted state order.
    for m, (k, v) in enumerate(sorted(QValues.items()), start=1):
        print("item ",m,k,v)

    # Highest-valued action (and its value) for every state.
    for k, v in sorted(QValues.items()):
        bestAction, bestValue = max(v.items(), key=operator.itemgetter(1))
        print("state: ",k," action: ",bestAction, " value: ",bestValue)
     


Q value function: 
item  1 (4, 1) {'0': -0.8, '1': -0.99858383325}
item  2 (4, 2) {'0': -0.14285714285714285, '1': -0.5707144285714285}
item  3 (4, 3) {'0': -0.75, '1': -0.1998999}
item  4 (4, 4) {'0': -0.75, '1': -0.33286679999999996}
item  5 (4, 5) {'0': 0.0, '1': -0.14278564285714285}
item  6 (4, 6) {'0': 0.38461538461538464, '1': -0.3120625625624999}
item  7 (4, 7) {'0': -0.6, '1': -0.17647035299999997}
item  8 (4, 8) {'0': -0.25, '1': -0.42771492828578567}
item  9 (4, 9) {'0': -0.4444444444444444, '1': -0.6352731818181819}
item  10 (4, 10) {'0': -0.5609756097560976, '1': -0.7547325850001215}
item  11 (5, 1) {'0': -0.9047619047619048, '1': -0.6833160525789475}
item  12 (5, 2) {'0': -0.3333333333333333, '1': -0.39945034975005}
item  13 (5, 3) {'0': -0.21428571428571427, '1': -0.3041302175217392}
item  14 (5, 4) {'0': 0.03225806451612903, '1': -0.124916625}
item  15 (5, 5) {'0': 0.08333333333333333, '1': -0.30408704330439124}
item  16 (5, 6) {'0': -0.09090909090909091, '1': 0.0713571