In [3]:
from environment import *
import numpy as np

Control using monte carlo:

### Variables

**__Value Function__** <br>
__Q__ ==> Keeps track of value of each state/action pair, it is a Matrix of size 10x21x2 <br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;10 -- No. of possible states of dealer. (one card is face up)<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;21 -- No. of possible states of player <br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;2  -- No. of possible actions <br>

*__Step Size__* 

$\alpha = \frac{1}{N(s,a)}$ <br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;N(s,a)   ==> Number of times 'a' was taken when in state 's'.  <br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;|N(s,a)| ==> 10x21x2  <br>

$\epsilon = \frac{N_0}{N_0 - N(s_t)}$

&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;$N_0$     ==> Constant value ~100 <br> 
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;N($s_t$)  ==> Number of times state $s_t$ has been visited.

### *Action Function*

__Parameters__: state 

Chooses action based on Epsilon value:
With $\epsilon$ probability it takes a random action i.e hit or stick (prob of picking either action randomly is 0.5. <br>
With 1-$\epsilon$ probability we pick action greedily i.e action corresponding to max Q value. <br>


### *Training Agent*

__Parameters__: iterations

Initialize start state.

For each episode :<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; Keep track of state,action pair, reward and number of times s,a pair were visited.<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; For each state action pair update Q table. <br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; $error = r - Q_prev $<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; $Q =: Q + error*stepsize$ <br>
                


In [48]:
class MC_Agent:
    def __init__(self,env,n0):
        self.value = 0
        self.n0 = float(n0)
        self.env = env
        self.N = np.zeros((env.dealer_values_count, env.player_values_count, env.actions_count))
        self.Q = np.zeros((env.dealer_values_count, env.player_values_count, env.actions_count))
        self.V = np.zeros((env.dealer_values_count, env.player_values_count))
         
       
        
        self.count_wins = 0
        self.iterations = 0
        
    def get_action(self,state):
        dealer_idx = state.dealer - 1
        player_idx = state.player - 1
        
        n_visits = sum((self.N[dealer_idx, player_idx,:]))
        epsilon = self.n0/(self.n0 + n_visits)
        
        if random.random() < epsilon:
            action = Actions.hit if random.random()<0.5 else Actions.stick
            return action
        else:
            action = Actions.to_action(np.argmax(self.Q[dealer_idx,player_idx,:]))

            return action
    
    def train(self, iterations):
        
        episode_pairs = []    
        for epsiode in range(iterations):
            s = self.env.get_start_state()
            while not s.term:
                a = self.get_action(s)
                episode_pairs.append((s,a))
                self.N[s.dealer-1,s.player-1,Actions.as_int(a)] += 1
                
                s,r = self.env.step(s, a)
                
            for curr_s,curr_a in episode_pairs:
                dealer_idx = curr_s.dealer - 1
                player_idx = curr_s.player - 1
                action_idx = Actions.as_int(curr_a)
                
                step    =  1/self.N[dealer_idx, player_idx, action_idx]
                error   =  r - self.Q[dealer_idx, player_idx, action_idx]
                self.Q +=  step*error
                
        self.iterations += iterations
        print (float(self.count_wins)/self.iterations*100)

        # Derive value function
        for d in range(self.env.dealer_values_count):
            for p in range(self.env.player_values_count):
                self.V[d,p] = max(self.Q[d, p, :])
                
    def plot_frame(self, ax):
        def get_stat_val(x, y):
            return self.V[x, y]

        X = np.arange(0, self.env.dealer_values_count, 1)
        Y = np.arange(0, self.env.player_values_count, 1)
        X, Y = np.meshgrid(X, Y)
        Z = get_stat_val(X, Y)
        surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
        return surf
                
        

In [49]:
n0 = 100
agent = MC_Agent(Environment(), n0)
for i in range (10):
    print(i)
    agent.train(5)

0
0.0
1
0.0
2
0.0
3
0.0
4
0.0
5
0.0
6
0.0
7
0.0
8
0.0
9
0.0
