#### Episode

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from gym import spaces
import matplotlib.colors as mcolors
import seaborn as sns
from scipy import stats

In [None]:
class BlackJackEnv(gym.Env):
    '''
    Simplified Blackjack environment with three actions: hit (0), stick (1)
    and double down (2).

    Cards are drawn with replacement: the values 1-9 have probability 1/13 each
    and the value 10 has probability 4/13. An ace is dealt as 1 and is stored
    as 11 inside a hand for as long as that keeps the hand total at or below 21.

    step() returns a result dictionary ('winner', 'is_done', 'reward') instead
    of the usual gym tuple; the observable state is read with get_current_state().
    '''

    metadata = {'render.modes':['human']}

    ### Card values and their draw probabilities
    _CARD_VALUES = range(1, 11)
    _CARD_PROBS = [1/13] * 9 + [4/13]

    def __init__(self):
        super().__init__()
        ### NOTE(review): 580 = 29 * 10 * 2 ignores the double_count state variable - confirm
        self.observation_space = spaces.Discrete(580)
        self.action_space = spaces.Discrete(3)     ### 0 = hit, 1 = stick, 2 = double down
        self.step_count = 0                        ### Number of actions taken in the game till now
        self.double_count = 0                      ### Number of double downs taken in the game till now


    def check_usable_ace(self,hand):

        '''
        input  - player or dealer's hand (numpy array of card values)
        output - True or False, depending on if the hand has usable ace or not

        An ace (1) is usable when counting one ace as 11 instead of 1 keeps
        the hand total at or below 21. The hand passed in is not modified.
        '''
        ### Counting one ace as 11 instead of 1 adds 10 to the total
        return bool(np.any(hand == 1) and hand.sum() + 10 <= 21)

    def use_ace(self,hand):
        '''
        input - player or dealer's hand
        output - new hand with the first ace(1) replaced with 11

        Raises IndexError when the hand holds no ace.
        '''
        temp_hand = hand.copy()
        temp_hand[np.where(temp_hand == 1)[0][0]] = 11
        return temp_hand

    def _draw(self, size=None):
        ''' draws one card (size=None) or an array of `size` cards '''
        return np.random.choice(self._CARD_VALUES, size, p=self._CARD_PROBS)

    def _settle_aces(self, hand):
        '''
        input - a hand in which aces are stored as 1 or 11
        output - new hand with the aces valued so that the total is as high
                 as possible without exceeding 21 (when that can be achieved)
        '''
        settled = hand.copy()

        ### An ace counted as 11 falls back to 1 when it would bust the hand
        while settled.sum() > 21 and np.any(settled == 11):
            settled[np.where(settled == 11)[0][0]] = 1

        ### An ace counted as 1 is promoted to 11 when the total allows it
        if self.check_usable_ace(settled):
            settled = self.use_ace(settled)

        return settled


    def reset(self):

        '''
        function to reset the environment's state and deal a new game
        '''
        ### Per-game counters; step() relies on step_count == 1 to detect
        ### the first move of a game, so it has to restart with every game
        self.step_count = 0
        self.double_count = 0

        ### Variable is used to inform whether the dealer has sticked,
        ### Used to know when to terminate the game
        self.dealer_stick = False
        self.player_stick = False

        ### New Player Hand, a usable ace is stored as 11
        self.current_hand = self._settle_aces(self._draw(2))
        self.usable_ace = bool(np.any(self.current_hand == 11))

        ### State variable Current Sum
        self.current_sum = self.current_hand.sum()

        ### Dealer's New Hand
        dealer_cards = self._draw(2)

        ### State Variable: Dealer Showing Card, read before the ace
        ### replacement so that an ace is shown as 1
        self.dealer_showing_card = dealer_cards[0]

        self.dealer_hand = self._settle_aces(dealer_cards)

        ### Dealer's Sum
        self.dealer_sum = self.dealer_hand.sum()


    def take_turn(self,player):

        '''
        Used to play one turn of the player, Called from "step()" function, depending upon the game state
        player can have two values - 'player' or 'dealer'

        It takes a new random card, adds it to that hand, re-values the aces
        of the hand and updates the hand sum.

        Raises ValueError for any other value of player.
        '''

        if player not in ('player', 'dealer'):
            raise ValueError("player must be 'player' or 'dealer', got {!r}".format(player))

        ### takes a new random card
        new_card = self._draw()

        if player == 'dealer':
            self.dealer_hand = self._settle_aces(np.append(self.dealer_hand, new_card))
            self.dealer_sum = self.dealer_hand.sum()
        else:
            self.current_hand = self._settle_aces(np.append(self.current_hand, new_card))
            ### The ace stops being usable once it had to fall back to 1
            self.usable_ace = bool(np.any(self.current_hand == 11))
            self.current_sum = self.current_hand.sum()


    def check_game_status(self, mode = 'normal', dd = 0):

        '''
         checks the status of the game, there are two modes
         'normal' mode - the default mode, this is used to check after
                         each turn whether a terminal state has been reached
         'compare' mode - used when we need to compare the totals of both the players
                          to judge the winner

         dd is the number of double downs taken so far: the reward is
         multiplied by 1 when dd is 0 and by 2 * dd otherwise

         returns a result dictionary with the winner, whether the game is finished
         and the reward of the game
        '''
        result = {'winner':'',
                 'is_done': False,
                 'reward':0}

        multiplier = 1 if dd == 0 else 2 * dd

        if mode == 'normal':

            if self.current_sum > 21:            ### player is bust
                result['winner'] = 'dealer'
                result['is_done'] = True
                result['reward'] = -1 * multiplier
            elif self.dealer_sum > 21:           ### dealer is bust
                result['winner'] = 'player'
                result['is_done'] = True
                result['reward'] = 1 * multiplier
            elif self.current_sum == 21:
                result['winner'] = 'player'
                result['is_done'] = True
                result['reward'] = 1 * multiplier
            elif self.dealer_sum == 21:
                result['winner'] = 'dealer'
                result['is_done'] = True
                result['reward'] = -1 * multiplier

        elif mode == 'compare':

            ### The hand closer to 21 wins, equal totals are a draw
            result['is_done'] = True
            diff_21_player = 21 - self.current_sum
            diff_21_dealer = 21 - self.dealer_sum

            if diff_21_player > diff_21_dealer:
                result['reward'] = -1 * multiplier
                result['winner'] = 'dealer'
            elif diff_21_player < diff_21_dealer:
                result['reward'] = 1 * multiplier
                result['winner'] = 'player'
            else:
                result['reward'] = 0
                result['winner'] = 'draw'

        return result

    def step(self,action):

        '''
        Performs one action: 0 = Hit, 1 = Stick, 2 = Double Down

        returns - a result dictionary with the winner, whether the game is finished
        and the reward of the game

        Raises ValueError for any other action.
        '''

        if action not in (0, 1, 2):
            raise ValueError('action must be 0 (hit), 1 (stick) or 2 (double down), got {!r}'.format(action))

        self.step_count += 1  ### Number of actions taken in the game till now

        result = {'winner':'',
                 'is_done': False,
                 'reward':0}

        ### Before taking the first step of the game we need to check for "natural"
        ### winning condition if the initial two cards of the players are 21
        ### If anyone has 21, then that player wins, if both have 21, then the game is
        ### drawn. Otherwise the game will continue

        if self.step_count == 1:

            if self.current_sum == 21 and self.dealer_sum == 21:
                result['is_done'] = True
                result['reward'] = 0
                result['winner'] = 'draw'
                return result

            elif self.current_sum == 21 and self.dealer_sum < 21:
                result['is_done'] = True
                result['reward'] = 1
                result['winner'] = 'player'
                return result

            elif self.dealer_sum == 21 and self.current_sum < 21:
                result['is_done'] = True
                result['reward'] = -1
                result['winner'] = 'dealer'
                return result

            ### A dealer dealt 17 or more never draws a card
            if self.dealer_sum >= 17:
                self.dealer_stick = True

        ### Double down raises the stake first, then draws a card like a hit
        if action == 2:
            self.double_count += 1

        if action in (0, 2):

            ### Player Takes Turn
            self.take_turn('player')

            ### Checking game status
            return self.check_game_status(dd = self.double_count)

        ### action = 1, meaning "stick"

        if self.dealer_stick == True:  ### if dealer has already sticked
            return self.check_game_status(mode = 'compare', dd = self.double_count)

        ### if dealer has not sticked, the dealer hits unless his sum >= 17, then he sticks

        ### Dealers Turn
        while self.dealer_sum < 17:

            self.take_turn('dealer')
            result = self.check_game_status(dd = self.double_count)
            if result['is_done'] == True:
                return result

        ### if the game is not finished yet, then we set dealer_stick status to True
        ### This means that our player can hit or stick

        self.dealer_stick = True

        return result


    def get_current_state(self):
        '''
        returns the current state variables, current_sum, dealer_showing_card,
        usable_ace and double_count
        '''
        current_state = {}

        current_state['current_sum'] = self.current_sum
        current_state['dealer_showing_card'] = self.dealer_showing_card
        current_state['usable_ace'] = self.usable_ace
        current_state['double_count'] = self.double_count

        return current_state


    def render(self, mode = 'human'):
        '''
        prints the observable state followed by the hidden hands and the dealer's sum
        '''

        print('OBSERVABLE STATES')
        print('Current Sum - {}'.format(self.current_sum))
        print('Dealer Showing Card - {}'.format(self.dealer_showing_card))
        print('Usable Ace - {}'.format(self.usable_ace))
        print('Double Count - {}'.format(self.double_count))

        print('AUXILLARY INFORMATION ------------------------------')
        print('Current Hand - {}'.format(self.current_hand))
        print('Dealer Hand - {}'.format(self.dealer_hand))
        print('Dealer Sum - {}'.format(self.dealer_sum))

In [None]:
bj = BlackJackEnv() # because of the double down this always has to be re-created

In [None]:
### Deal a new game and print the resulting state
bj.reset()
bj.render()

In [None]:
### Action 1 = Stick; step() returns the result dictionary (winner, is_done, reward)
print(bj.step(1))
bj.render()

In [None]:
### Action = Stick. Once the player "Sticks", the dealer keeps hitting while its sum is below 17, after that it stops
print(bj.step(1))
bj.render()

In [None]:
### Action = Stick. When the dealer has already sticked, step() compares both sums to decide the winner
print(bj.step(1))
bj.render()

#### Stochastic

In [None]:
class BlackJackEnv_Stochastic(gym.Env):
    '''
    Blackjack environment in which the action that is executed may differ
    from the action the agent intended.

    The intended action (0 = hit, 1 = stick, 2 = double down) is executed with
    probability 0.9; see _SLIP_PROBS for the remaining probability mass.

    Cards are drawn with replacement: the values 1-9 have probability 1/13 each
    and the value 10 has probability 4/13. An ace is dealt as 1 and is stored
    as 11 inside a hand for as long as that keeps the hand total at or below 21.

    Here double_count is the reward multiplier itself: it starts at 1 and is
    doubled by every executed double down.
    '''

    metadata = {'render.modes':['human']}

    ### Card values and their draw probabilities
    _CARD_VALUES = range(1, 11)
    _CARD_PROBS = [1/13] * 9 + [4/13]

    ### intended action -> probabilities of executing hit, stick, double down.
    ### Hit and stick slip only into each other; double down slips evenly
    ### into hit and stick.
    _SLIP_PROBS = {0: [0.90, 0.10, 0.00],
                   1: [0.10, 0.90, 0.00],
                   2: [0.05, 0.05, 0.90]}

    def __init__(self):
        super().__init__()
        self.observation_space = spaces.Discrete(580)
        self.action_space = spaces.Discrete(3)     ### 0 = hit, 1 = stick, 2 = double down
        self.step_count = 0                        ### Number of actions taken in the game till now
        self.double_count = 1                      ### Reward multiplier, doubled by each double down


    def check_usable_ace(self,hand):

        '''
        input  - player or dealer's hand (numpy array of card values)
        output - True or False, depending on if the hand has usable ace or not

        An ace (1) is usable when counting one ace as 11 instead of 1 keeps
        the hand total at or below 21. The hand passed in is not modified.
        '''
        ### Counting one ace as 11 instead of 1 adds 10 to the total
        return bool(np.any(hand == 1) and hand.sum() + 10 <= 21)

    def use_ace(self,hand):
        '''
        input - player or dealer's hand
        output - new hand with the first ace(1) replaced with 11

        Raises IndexError when the hand holds no ace.
        '''
        temp_hand = hand.copy()
        temp_hand[np.where(temp_hand == 1)[0][0]] = 11
        return temp_hand

    def _draw(self, size=None):
        ''' draws one card (size=None) or an array of `size` cards '''
        return np.random.choice(self._CARD_VALUES, size, p=self._CARD_PROBS)

    def _settle_aces(self, hand):
        '''
        input - a hand in which aces are stored as 1 or 11
        output - new hand with the aces valued so that the total is as high
                 as possible without exceeding 21 (when that can be achieved)
        '''
        settled = hand.copy()

        ### An ace counted as 11 falls back to 1 when it would bust the hand
        while settled.sum() > 21 and np.any(settled == 11):
            settled[np.where(settled == 11)[0][0]] = 1

        ### An ace counted as 1 is promoted to 11 when the total allows it
        if self.check_usable_ace(settled):
            settled = self.use_ace(settled)

        return settled


    def reset(self):

        '''
        function to reset the environment's state and deal a new game
        '''
        ### Per-game counters; step() relies on step_count == 1 to detect
        ### the first move of a game, so it has to restart with every game
        self.step_count = 0
        self.double_count = 1

        ### Variable is used to inform whether the dealer has sticked,
        ### Used to know when to terminate the game
        self.dealer_stick = False
        self.player_stick = False

        ### New Player Hand, a usable ace is stored as 11
        self.current_hand = self._settle_aces(self._draw(2))
        self.usable_ace = bool(np.any(self.current_hand == 11))

        ### State variable Current Sum
        self.current_sum = self.current_hand.sum()

        ### Dealer's New Hand
        dealer_cards = self._draw(2)

        ### State Variable: Dealer Showing Card, read before the ace
        ### replacement so that an ace is shown as 1
        self.dealer_showing_card = dealer_cards[0]

        self.dealer_hand = self._settle_aces(dealer_cards)

        ### Dealer's Sum
        self.dealer_sum = self.dealer_hand.sum()


    def take_turn(self,player):

        '''
        Used to play one turn of the player, Called from "step()" function, depending upon the game state
        player can have two values - 'player' or 'dealer'

        It takes a new random card, adds it to that hand, re-values the aces
        of the hand and updates the hand sum.

        Raises ValueError for any other value of player.
        '''

        if player not in ('player', 'dealer'):
            raise ValueError("player must be 'player' or 'dealer', got {!r}".format(player))

        ### takes a new random card
        new_card = self._draw()

        if player == 'dealer':
            self.dealer_hand = self._settle_aces(np.append(self.dealer_hand, new_card))
            self.dealer_sum = self.dealer_hand.sum()
        else:
            self.current_hand = self._settle_aces(np.append(self.current_hand, new_card))
            ### The ace stops being usable once it had to fall back to 1
            self.usable_ace = bool(np.any(self.current_hand == 11))
            self.current_sum = self.current_hand.sum()


    def check_game_status(self, mode = 'normal', dd = 1):

        '''
         checks the status of the game, there are two modes
         'normal' mode - the default mode, this is used to check after
                         each turn whether a terminal state has been reached
         'compare' mode - used when we need to compare the totals of both the players
                          to judge the winner

         dd is the factor the reward is multiplied with

         returns a result dictionary with the winner, whether the game is finished
         and the reward of the game
        '''
        result = {'winner':'',
                 'is_done': False,
                 'reward':0}

        if mode == 'normal':

            if self.current_sum > 21:            ### player is bust
                result['winner'] = 'dealer'
                result['is_done'] = True
                result['reward'] = -1 * dd
            elif self.dealer_sum > 21:           ### dealer is bust
                result['winner'] = 'player'
                result['is_done'] = True
                result['reward'] = 1 * dd
            elif self.current_sum == 21:
                result['winner'] = 'player'
                result['is_done'] = True
                result['reward'] = 1 * dd
            elif self.dealer_sum == 21:
                result['winner'] = 'dealer'
                result['is_done'] = True
                result['reward'] = -1 * dd

        elif mode == 'compare':

            ### The hand closer to 21 wins, equal totals are a draw
            result['is_done'] = True
            diff_21_player = 21 - self.current_sum
            diff_21_dealer = 21 - self.dealer_sum

            if diff_21_player > diff_21_dealer:
                result['reward'] = -1 * dd
                result['winner'] = 'dealer'
            elif diff_21_player < diff_21_dealer:
                result['reward'] = 1 * dd
                result['winner'] = 'player'
            else:
                result['reward'] = 0
                result['winner'] = 'draw'

        return result

    def step(self,action_intended):

        '''
        Performs one action: 0 = Hit, 1 = Stick, 2 = Double Down.
        The action that is executed is drawn at random and equals
        action_intended with probability 0.9.

        returns - a result dictionary with the winner, whether the game is finished
        and the reward of the game

        Raises ValueError for any other action_intended.
        '''

        if action_intended not in self._SLIP_PROBS:
            raise ValueError('action_intended must be 0 (hit), 1 (stick) or 2 (double down), got {!r}'.format(action_intended))

        self.step_count += 1  ### Number of actions taken in the game till now

        result = {'winner':'',
                 'is_done': False,
                 'reward':0}

        ### Before taking the first step of the game we need to check for "natural"
        ### winning condition if the initial two cards of the players are 21
        ### If anyone has 21, then that player wins, if both have 21, then the game is
        ### drawn. Otherwise the game will continue

        if self.step_count == 1:

            if self.current_sum == 21 and self.dealer_sum == 21:
                result['is_done'] = True
                result['reward'] = 0
                result['winner'] = 'draw'
                return result

            elif self.current_sum == 21 and self.dealer_sum < 21:
                result['is_done'] = True
                result['reward'] = 1
                result['winner'] = 'player'
                return result

            elif self.dealer_sum == 21 and self.current_sum < 21:
                result['is_done'] = True
                result['reward'] = -1
                result['winner'] = 'dealer'
                return result

            ### A dealer dealt 17 or more never draws a card
            if self.dealer_sum >= 17:
                self.dealer_stick = True


        #### Adding Stochastic Behaviour
        #### The executed action is sampled around the intended one

        action = np.random.choice([0, 1, 2], p = self._SLIP_PROBS[action_intended])

        ### Double down raises the stake first, then draws a card like a hit
        if action == 2:
            self.double_count = self.double_count * 2

        if action in (0, 2):

            ### Player Takes Turn
            self.take_turn('player')

            ### Checking game status
            return self.check_game_status(dd = self.double_count)

        ### action = 1, meaning "stick"

        if self.dealer_stick == True:  ### if dealer has already sticked
            return self.check_game_status(mode = 'compare', dd = self.double_count)

        ### if dealer has not sticked, the dealer hits unless his sum >= 17, then he sticks

        ### Dealers Turn
        while self.dealer_sum < 17:

            self.take_turn('dealer')
            result = self.check_game_status(dd = self.double_count)
            if result['is_done'] == True:
                return result

        ### if the game is not finished yet, then we set dealer_stick status to True
        ### This means that our player can hit or stick

        self.dealer_stick = True

        return result


    def get_current_state(self):
        '''
        returns the current state variables, current_sum, dealer_showing_card, usable_ace
        '''
        current_state = {}

        current_state['current_sum'] = self.current_sum
        current_state['dealer_showing_card'] = self.dealer_showing_card
        current_state['usable_ace'] = self.usable_ace

        return current_state


    def render(self, mode = 'human'):
        '''
        prints the observable state followed by the hidden hands and the dealer's sum
        '''

        print('OBSERVABLE STATES')
        print('Current Sum - {}'.format(self.current_sum))
        print('Dealer Showing Card - {}'.format(self.dealer_showing_card))
        print('Usable Ace - {}'.format(self.usable_ace))

        print('AUXILLARY INFORMATION ------------------------------')
        print('Current Hand - {}'.format(self.current_hand))
        print('Dealer Hand - {}'.format(self.dealer_hand))
        print('Dealer Sum - {}'.format(self.dealer_sum))



In [None]:
### Environment whose executed action can differ from the intended one
bjs = BlackJackEnv_Stochastic()

In [None]:
### Deal a new game and print the resulting state
bjs.reset()
bjs.render()


In [None]:
### Intended action 2 is passed in. step() executes a randomly perturbed action, so the
### rendered state shows what actually ran (a changed player sum means a card was drawn)
bjs.step(2)
bjs.render()

In [None]:
### Intended action 1 (stick). step() executes a randomly perturbed action, so an
### unchanged player sum in the output below suggests that stick is what actually ran
bjs.step(1)
bjs.render()

#### Q-Learning

In [None]:

#### The five dictionaries below convert state values (current_sum, dealer card,
#### usable ace, double count) and action names to indexes in the Q value table


current_sum_to_index = {value: idx for value, idx in zip(np.arange(4, 33), np.arange(29))}
dealer_showing_card_to_index = {value: idx for value, idx in zip(np.arange(1, 11), np.arange(10))}
usable_ace_index = {False: 0, True: 1}
double_count_index = {value: idx for value, idx in zip(np.arange(0, 10), np.arange(10))}
action_index = {'hit': 0, 'stick': 1, 'double_down': 2}

def get_state_q_indices(current_state):

    '''
    converts a state dictionary to its indices in the Q table

    input  - state dictionary with the keys 'current_sum', 'dealer_showing_card',
             'usable_ace' and 'double_count'
    output - list of the four indices, in that order

    Raises KeyError when a key is missing or a value has no index.
    '''
    lookups = (
        (current_sum_to_index, 'current_sum'),
        (dealer_showing_card_to_index, 'dealer_showing_card'),
        (usable_ace_index, 'usable_ace'),
        (double_count_index, 'double_count'),
    )

    return [table[current_state[key]] for table, key in lookups]

def get_max_action(Q_sa, current_state):

    '''
    returns the index of the action with the highest q-value for the
    given state, read from the Q table Q_sa

    '''

    sum_idx, dealer_idx, ace_idx, double_idx = get_state_q_indices(current_state)

    ### q-values of every action available in this state
    action_values = Q_sa[sum_idx, dealer_idx, ace_idx, double_idx, :]

    ### argmax returns the first maximum, so ties go to the lowest action index
    ### this selection is purely greedy, it can be made epsilon greedy

    return action_values.argmax()

def get_q_value(Q_sa, state, action):
    '''
    looks up the Q value stored in the Q table Q_sa for the given
    state dictionary and action index

    '''
    sum_idx, dealer_idx, ace_idx, double_idx = get_state_q_indices(state)

    return Q_sa[sum_idx, dealer_idx, ace_idx, double_idx, action]

In [None]:
### Axes of the Q value table:
### first dimension  - current sum (4-32, 29 values)
### second dimension - dealers showing card (1-10)
### third dimension  - usable ace (False,True)
### fourth dimension - number of double downs (0-9)
### fifth dimension - action (hit, stick, double down)

Q = np.zeros((29,10,2,10,3)) #### Initializing the Q value Table with zeros

In [None]:
episode_count = 0
total_episodes = 100000
gamma = 0.9             #### the discount factor
alpha = 0.1             #### learning rate
bj = BlackJackEnv()


#### Q-Learning: every episode is one complete game, played until the
#### environment reports is_done

while episode_count < total_episodes:

    ### step() only runs its first-move checks when step_count == 1,
    ### so every game has to start counting from zero
    bj.step_count = 0

    bj.reset()  ### Initialize S (the environment's starting state)

    current_state = bj.get_current_state()
    is_done = False

    while not is_done:

        ### Greedy action for the current state
        current_action = get_max_action(Q, current_state)

        ### Take Action
        step_result = bj.step(current_action)

        is_done = step_result['is_done']
        immediate_reward = step_result['reward']

        if is_done:
            #### A finished game has no next state to bootstrap from
            td_target = immediate_reward
        else:
            next_state = bj.get_current_state()
            next_max_action = get_max_action(Q, next_state)

            #### Get Q value for the next state and max action in the next state
            q_max_s_a = get_q_value(Q, next_state, next_max_action)
            td_target = immediate_reward + gamma * q_max_s_a

        #### Getting Q value for the current state and action
        q_current_s_a = get_q_value(Q, current_state, current_action)

        td_error = td_target - q_current_s_a

        state_q_idxs = get_state_q_indices(current_state)

        #### Updating current Q(S,A)
        Q[state_q_idxs[0],state_q_idxs[1],state_q_idxs[2],state_q_idxs[3],current_action] = q_current_s_a + alpha*td_error

        if not is_done:
            current_state = next_state  ### S=S'

    episode_count+=1

    if episode_count%10000 == 0:
        print('---------Episode - {} -----------'.format(episode_count))



## Plots

#### Usable Ace = False

In [None]:
### Q is indexed as [player sum, dealer card, usable ace, double count, action].
### The first 17 rows are the player sums 4..20.
Q_adjusted = Q[:17, :, :, :]
fig, ax = plt.subplots(ncols= 3,figsize=(18,6))

for panel, (action_idx, action_name) in zip(ax, enumerate(['Hit', 'Stick', 'Double Down'])):

    ### usable ace = False (index 0), no double down taken so far (index 0);
    ### the slice has to be 2-D (player sum x dealer card) for the heatmap
    sns.heatmap(Q_adjusted[:,:,0,0,action_idx],cmap = sns.light_palette((210, 90, 60), input="husl"), ax = panel,
                xticklabels=np.arange(1,11),yticklabels=np.arange(4,21))
    panel.set_title('Usable Ace = False, Action = {}'.format(action_name))
    panel.set_xlabel('Dealer Showing Card')
    panel.set_ylabel('Current Player Sum')

#### Usable Ace = True

In [None]:
fig, ax = plt.subplots(ncols = 2, figsize=(16,8))

for panel, (action_idx, action_name) in zip(ax, enumerate(['Hit', 'Stick'])):

    ### Q is indexed as [player sum, dealer card, usable ace, double count, action].
    ### Rows 0..16 are the player sums 4..20; usable ace = True (index 1),
    ### no double down taken so far (index 0)
    sns.heatmap(Q[:17,:,1,0,action_idx],cmap = sns.light_palette((210, 90, 60), input="husl"), ax = panel,
                xticklabels=np.arange(1,11),yticklabels=np.arange(4,21))
    panel.set_title('Usable Ace = True, Action = {}'.format(action_name))
    panel.set_xlabel('Dealer Showing Card')
    panel.set_ylabel('Current Player Sum')

In [None]:
yticks = np.arange(4, 21)    # player sums 4..20, one label per plotted row
xticks = np.arange(1, 11)    # dealer cards 1..10, one label per plotted column

# Define colors for actions (index 0 = hit, 1 = stick, 2 = double down)
colors = ["green", "red", "orange"]
cmap = mcolors.ListedColormap(colors)
bounds = [0, 1, 2, 3]
norm = mcolors.BoundaryNorm(bounds, cmap.N)

# Average the Q-values across the double_count dimension
avg_Q = np.mean(Q[:17, :, :, :, :], axis=3)

# Determine the best action for each state from the averaged Q-values
# (after averaging, the action axis is the last one)
best_action = np.argmax(avg_Q, axis=3)

# Generate the heatmaps for usable ace false and true
fig, ax = plt.subplots(ncols=2, figsize=(16, 8))

for ace_idx, ace_label in enumerate(['False', 'True']):
    sns.heatmap(best_action[:, :, ace_idx], cmap=cmap, norm=norm, linewidths=1,
                xticklabels=xticks, yticklabels=yticks, ax=ax[ace_idx])
    ax[ace_idx].set_title('Best Action Heatmap (Usable Ace = {})'.format(ace_label))
    ax[ace_idx].set_xlabel('Dealer Showing Card')
    ax[ace_idx].set_ylabel('Current Player Sum')

#### Sarsa

In [None]:
#### Lookup tables converting state values and action names to Q table indexes
current_sum_to_index = {value: idx for value, idx in zip(np.arange(4, 33), np.arange(29))}
dealer_showing_card_to_index = {value: idx for value, idx in zip(np.arange(1, 11), np.arange(10))}
usable_ace_index = {False: 0, True: 1}
double_count_index = {value: idx for value, idx in zip(np.arange(0, 10), np.arange(10))}
action_index = {'hit': 0, 'stick': 1, 'double_down': 2}

def get_state_q_indices(current_state):

    '''
    converts a state dictionary to its indices in the Q table

    input  - state dictionary with the keys 'current_sum', 'dealer_showing_card',
             'usable_ace' and 'double_count'
    output - list of the four indices, in that order

    Raises KeyError when a key is missing or a value has no index.
    '''
    lookups = (
        (current_sum_to_index, 'current_sum'),
        (dealer_showing_card_to_index, 'dealer_showing_card'),
        (usable_ace_index, 'usable_ace'),
        (double_count_index, 'double_count'),
    )

    return [table[current_state[key]] for table, key in lookups]

def get_max_action(Q_sa, current_state):
    '''
    returns the index of the action with the highest q-value for the
    given state, read from the Q table Q_sa
    '''
    sum_idx, dealer_idx, ace_idx, double_idx = get_state_q_indices(current_state)

    ### q-values of every action available in this state
    action_values = Q_sa[sum_idx, dealer_idx, ace_idx, double_idx, :]

    ### argmax returns the first maximum, so ties go to the lowest action index
    ### this selection is purely greedy, it can be made epsilon greedy

    return action_values.argmax()

def get_q_value(Q_sa, state, action):
    '''
    looks up the Q value stored in the Q table Q_sa for the given
    state dictionary and action index
    '''
    sum_idx, dealer_idx, ace_idx, double_idx = get_state_q_indices(state)

    return Q_sa[sum_idx, dealer_idx, ace_idx, double_idx, action]

def get_action_epsilon_greedy(Q_sa, current_state, epsilon):
    '''
    Get action using epsilon-greedy policy.

    With probability epsilon a uniformly random action is returned, otherwise
    the action with the max q-value for current_state. The random action is
    drawn from every action of the Q table (the size of its last axis), so
    that all of them get explored.
    '''
    if np.random.rand() < epsilon:
        return np.random.choice(Q_sa.shape[-1])  # random action
    return get_max_action(Q_sa, current_state)

In [None]:
Q = np.zeros((29,10,2,10,3)) #### Initializing the Q value Table with zeros
episode_count = 0
total_episodes = 100000
gamma = 0.9             #### the discount factor
alpha = 0.1             #### learning rate
epsilon = 0.1           #### epsilon for epsilon-greedy policy
bj = BlackJackEnv()

#### Sarsa: every episode is one complete game, played until the
#### environment reports is_done

while episode_count < total_episodes:

    ### step() only runs its first-move checks when step_count == 1,
    ### so every game has to start counting from zero
    bj.step_count = 0

    bj.reset()  ### Initialize S (the environment's starting state)

    current_state = bj.get_current_state()
    current_action = get_action_epsilon_greedy(Q, current_state, epsilon)
    is_done = False

    while not is_done:

        step_result = bj.step(current_action)

        is_done = step_result['is_done']
        immediate_reward = step_result['reward']

        if is_done:
            #### A finished game has no next state-action pair to bootstrap from
            td_target = immediate_reward
        else:
            next_state = bj.get_current_state()
            next_action = get_action_epsilon_greedy(Q, next_state, epsilon)
            td_target = immediate_reward + gamma * get_q_value(Q, next_state, next_action)

        q_current_s_a = get_q_value(Q, current_state, current_action)
        td_error = td_target - q_current_s_a

        Q_state_idxs = get_state_q_indices(current_state)
        Q[Q_state_idxs[0], Q_state_idxs[1], Q_state_idxs[2], Q_state_idxs[3], current_action] = q_current_s_a + alpha * td_error

        if not is_done:
            current_state = next_state  ### S=S'
            current_action = next_action  ### A=A'

    episode_count+=1

    if episode_count%10000 == 0:
        print('---------Episode - {} -----------'.format(episode_count))

In [None]:
### Q is indexed as [player sum, dealer card, usable ace, double count, action].
### The first 17 rows are the player sums 4..20.
Q_adjusted = Q[:17, :, :, :]
yticks = np.arange(4, 21)
fig, ax = plt.subplots(ncols=2, figsize=(16, 8))

for panel, (action_idx, action_name) in zip(ax, enumerate(['Hit', 'Stick'])):

    ### usable ace = False (index 0), no double down taken so far (index 0);
    ### the slice has to be 2-D (player sum x dealer card) for the heatmap
    sns.heatmap(Q_adjusted[:, :, 0, 0, action_idx], cmap=sns.light_palette((210, 90, 60), input="husl"), ax=panel,
                xticklabels=np.arange(1, 11), yticklabels=yticks)
    panel.set_title('Usable Ace = False, Action = {}'.format(action_name))
    panel.set_xlabel('Dealer Showing Card')
    panel.set_ylabel('Current Player Sum')

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(16, 8))

for panel, (action_idx, action_name) in zip(ax, enumerate(['Hit', 'Stick'])):

    ### Q is indexed as [player sum, dealer card, usable ace, double count, action].
    ### Rows 0..16 are the player sums 4..20; usable ace = True (index 1),
    ### no double down taken so far (index 0)
    sns.heatmap(Q[:17, :, 1, 0, action_idx], cmap=sns.light_palette((210, 90, 60), input="husl"), ax=panel,
                xticklabels=np.arange(1, 11), yticklabels=np.arange(4, 21))
    panel.set_title('Usable Ace = True, Action = {}'.format(action_name))
    panel.set_xlabel('Dealer Showing Card')
    panel.set_ylabel('Current Player Sum')

In [None]:
yticks = np.arange(4, 21)    # player sums 4..20, one label per plotted row
xticks = np.arange(1, 11)    # dealer cards 1..10, one label per plotted column

# Define colors for actions (index 0 = hit, 1 = stick, 2 = double down)
colors = ["green", "red", "orange"]
cmap = mcolors.ListedColormap(colors)
bounds = [0, 1, 2, 3]
norm = mcolors.BoundaryNorm(bounds, cmap.N)

# Average the Q-values across the double_count dimension
avg_Q = np.mean(Q[:17, :, :, :, :], axis=3)

# Determine the best action for each state from the averaged Q-values
# (after averaging, the action axis is the last one)
best_action = np.argmax(avg_Q, axis=3)

# Generate the heatmaps for usable ace false and true
fig, ax = plt.subplots(ncols=2, figsize=(16, 8))

for ace_idx, ace_label in enumerate(['False', 'True']):
    sns.heatmap(best_action[:, :, ace_idx], cmap=cmap, norm=norm, linewidths=1,
                xticklabels=xticks, yticklabels=yticks, ax=ax[ace_idx])
    ax[ace_idx].set_title('Best Action Heatmap (Usable Ace = {})'.format(ace_label))
    ax[ace_idx].set_xlabel('Dealer Showing Card')
    ax[ace_idx].set_ylabel('Current Player Sum')