# Gambler’s Problem
- A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips. 
- If the coin comes up heads, gambler wins as many dollars as the stake on that flip
- If the coin comes up tails, gambler loses the stake.
- The game ends when the gambler wins by reaching goal of $100 or loses by running out of money
- On each flip, the gambler must decide what portion of capital to stake, in integer numbers of dollars.

## Object
- Bet and win until capital = $100

### Problem Domain Analysis
 - Class Gambler: Environment
 - Game is modeled not built: Using either dynamics of P: S x A x S' x R  or experience through Monte Carlo Methods.
 - Game can be built out but is unnecessary for this problem set.
 - Compute max for V(s) and arg max a for deterministic policy Pi*

In [38]:
import random

In [39]:
class Gambler:
    
    def __init__(self, probability):
        self.probability = probability
        self.win_money = 1
        self.lose_money = 1
        self.goal = 100
        self.reward = 0
        self.policy, self.Returns, self.Return, self.Q = self.create()
        self.V = [0.0 for i in range(self.goal+1)]
        self.episode = []
        self.converged = False
        self.delta = 0
        self.theta = 0.00000001
        self.v = 0
        self.gamma = 1.0
        
    def create(self):
        """
        Creates arbitrary policy and returns list
        """
        
        # Empty structures for policy
        policy = {}; returns = {}; r = {}; q = {}
        
        # Create policy and returns
        for i in range(1, self.goal):
   
            # Policy - Key: states, Value: actions
            policy[i] = [min(s, self.goal - s) for s in range(i+1)]
            
            # Returns - Key: states, Value: empty list for all s of S
            returns[i] = []
            
            # Create states for Q(s,a)
            q[i] = [min(s, self.goal - s) for s in range(i+1)]
            
        # Create Q(s, a) and Returns(s, a)
        for state in policy:
            
            # Actions
            for action in policy[state]:
            
                # Q(s,a) - Key: states, actions | Value: R 
                q[state][action] = 0.0

                # Returns - Key: states, Value: empty list for all s of S
                r[(state, action)] = []
        
        
        return policy, returns, r, q
        
    def value_iteration(self):
        """
        Output a deterministic policy pi = pi*
        """
        
        ##############
        # Initialize #
        ##############
        
        # State value function
        V = self.V.copy()
        
        # TERMINAL STATES
        V[0] = V[-1] = 0
        
        # Arbitrary Policy
        policy = self.policy.copy()
        
        #############################
        # Loop until delta < theta: #
        #############################
        while not self.converged:
            
            # delta <- 0
            self.delta = 0
            
            ########################
            # loop for each s of S #
            ########################
            for state in self.policy:
               
                # keep track of actions, values
                actions = {}
                
                # v <- V (s)
                self.v = V[state]
                
                ##############
                # GAME MODEL #
                ##############
                for action in self.policy[state]:
                    
                    # keep track of look ahead values
                    self.win_money = state + action; self.lose_money = state - action
                    
                    # No reward during game or loss
                    self.reward = r = 0
                    
                    # Reward for WIN
                    if self.win_money >= self.goal:
                        
                        # Set value for V(s) index 
                        self.win_money = self.goal
                        
                        # Reward
                        self.reward = 1
                    
                    # lose money
                    if self.lose_money <= 0:
                        
                        # Set value for V(s) index
                        self.lose_money = 0
                    
                    
                    #############################################
                    # V(s) max_a p(s', r|s, a)[r + Gamma*V(s')] #
                    #############################################
                    value = self.probability*(self.reward + self.gamma*(V[self.win_money])) + (1-self.probability)*(r + self.gamma*(V[self.lose_money]))

                    # Track action values
                    actions[action] = value
                
                # place the max value in state
                V[state] = max(actions.values()) 
                
                #######################################################
                # Pi(s) = argmax_a SUM p(s', r|s, a)[r + Gamma*V(s')] #
                #######################################################
                policy[state] = [action for action, value in actions.items() if value == max(actions.values())]
                
                ###################################
                # delta <- max(delta, |v - V(s)|) #
                ###################################
                self.delta = max(self.delta, abs(self.v - V[state]))
                             
            # keep track of states
            self.V = V.copy()
            
            #######################
            # until delta < theta #
            #######################
            if self.delta < self.theta:

                # {Vk} converged to v*
                self.converged = True
        
        ###########################################
        # Output a deterministic policy, Pi = pi* #
        ###########################################
        self.policy = policy
        return policy
    
    def first_visit(self, episodes):
        """
        Returns Monte Carlo First Visit Prediction V = v*
        """
        
        #########################
        # Loop For Each Episode #
        #########################
        for i in range(episodes):
            
            #####################################################################
            # Generate an episode following p⇡: S0, A0, R1, ... ,ST-1, AT-1, RT #
            #####################################################################
            self.episode = [] 
            
            # G <- 0
            G = 0
            
            ######################
            # GAME MODEL EPISODE #
            ######################
            
            # loop for each s of S
            for state in self.policy:
            
                # action in each state
                for action in self.policy[state]:
                    
                    # keep track of win lose values
                    self.win_money = state + action; self.lose_money = state - action
                    
                    # No reward during game or loss
                    self.reward = 0
                    
                    # Reward for WIN
                    if self.win_money >= self.goal:
                        
                        # Set value for index
                        self.win_money = self.goal
                        
                        # Reward
                        self.reward = 1
                    
                    # lose money
                    if self.lose_money <= 0:
                        
                        # Set value for index
                        self.lose_money = 0
                        
                    # Episode
                    self.episode.append((state, action, self.reward))
                    
            ########################################################
            # Loop for each step of episode, t = T-1, T-2, ..., 0: #
            ########################################################
            for s, a, r in self.episode:
                
                # G <- Gamma*G + Rt+1
                G = self.gamma*G + r
              
                # Unless St appears in S0, S1,..., St-1:
                if s not in self.Returns:
                    
                    # Append G to Returns(St)
                    self.Returns[s].append(G)

                    # V(St) <- average(Returns(St))
                    self.V[s] = sum(self.Returns[s])/len(self.Returns[s])

        # V = v_pi
        return self.V

        
    def exploration_starts(self, episodes):
        """
        Returns Monte Carlo Exploration Starts Pi = pi*
        """
        
        # Policy pi
        policy = self.policy.copy()
        
        # States
        states = list(self.policy.keys())
        
        #########################
        # Loop For Each Episode #
        #########################
        for i in range(episodes):
            
            # Episode history
            episode = [] 
            
            #####################################################################
            # Generate an episode following p⇡: S0, A0, R1, ... ,ST-1, AT-1, RT #
            #####################################################################
            
            # G <- 0
            G = 0
            
            ######################
            # GAME MODEL EPISODE #
            ######################
            
            # Exploration starts
            state = random.choice(states)

            # loop for episode
            while self.win_money < self.goal and self.lose_money > 0:
                
                # Random action in each state
                action = random.choice(self.policy[state])
                
                # keep track of state - action values
                self.win_money = state + action; self.lose_money = state - action

                # No reward during game or loss
                self.reward = 0

                # Reward for WIN
                if self.win_money >= self.goal:

                    # Set value for index
                    self.win_money = self.goal

                    # Reward
                    self.reward = 1

                # lose money
                if self.lose_money <= 0:

                    # Set value for index
                    self.lose_money = 0
                    
                    # consequence 
                    self.reward = -1

                # Episode
                episode.append((state, action, self.reward))
                    
            ########################################################
            # Loop for each step of episode, t = T-1, T-2, ..., 0: #
            ########################################################
            for s, a, r in episode:
                
                # G <- Gamma*G + Rt+1
                G = self.gamma*G + r
              
                # Unless St appears in S0, S1,..., St-1:
                if s not in self.Return:
                    
                    # Append G to Returns(St, At)
                    self.Return[(s, a)].append(G)

                    # Q(St,At) <- average(Returns(St, At))
                    self.Q[s][a] = sum(self.Return[(s, a)])/len(self.Return[(s,a)])
                    
                    # Pi(s) = argmax_a Q(St, a)
                    policy[s] = [max(self.Q[s])]

        # Pi = pi*
        self.policy = policy
        return policy


In [40]:
if __name__ == "__main__":
    g = Gambler(0.4)
    
    print("Value Iteration")
    print(g.value_iteration())
    print()
    print("MC First Visit")
    print(g.first_visit(10000))
    print()
    print("MC Exploration Starts")
    print(g.exploration_starts(10000))


Value Iteration
{1: [0, 1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [0, 7], 8: [8], 9: [9], 10: [10], 11: [0, 11], 12: [12], 13: [0, 12, 13], 14: [11, 14], 15: [10, 15], 16: [0, 16], 17: [8, 17], 18: [0, 7, 18], 19: [0, 6, 19], 20: [5, 20], 21: [4], 22: [0, 3, 22], 23: [2], 24: [1, 24], 25: [0, 25], 26: [1, 24, 26], 27: [2, 23, 27], 28: [3], 29: [4], 30: [5, 20, 30], 31: [6, 19], 32: [7, 18], 33: [0, 17, 33], 34: [0, 9, 16, 34], 35: [10, 15], 36: [0, 11, 14], 37: [12, 13, 37], 38: [12], 39: [11], 40: [10], 41: [9], 42: [0, 8, 42], 43: [0, 7, 43], 44: [6, 44], 45: [5], 46: [0, 4, 46], 47: [3, 47], 48: [0, 2, 48], 49: [0, 1], 50: [0, 50], 51: [1, 49], 52: [2, 48], 53: [3], 54: [4, 46], 55: [5, 45], 56: [6, 44], 57: [43], 58: [0, 8], 59: [0, 9, 41], 60: [40], 61: [0, 39], 62: [12, 38], 63: [37], 64: [36], 65: [35], 66: [34], 67: [0, 8, 17, 33], 68: [0, 7, 18, 32], 69: [6, 19, 31], 70: [5, 30], 71: [0, 4, 21, 29], 72: [3, 22, 28], 73: [0, 2, 23, 27], 74: [0, 26], 75: [0, 25], 76: [1, 24