#### Episode

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from gym import spaces
import matplotlib.colors as mcolors
import seaborn as sns
import time
from itertools import product

In [None]:
class BlackJackEnv(gym.Env):
    """Single-player Blackjack with hit, stick, double-down and split.

    Cards are drawn with replacement: values 1-9 each have probability 1/13
    and value 10 has probability 4/13.  An ace is drawn as 1 and is promoted
    to 11 while that keeps the hand at 21 or below.

    A game moves through three stages:
        1 - the first (or only) player hand is being played,
        2 - the second hand of a split is being played,
        3 - the game has been settled.

    ``step`` does not follow the gym tuple convention; it returns a dict with
    the keys ``winner``, ``is_done1``, ``is_done2`` and ``reward``.

    Settlement rules implemented by ``check_game_status`` (per player hand):
    a hand that busts together with the dealer is a push, a busted hand loses
    to a dealer who did not bust, a hand that did not bust beats a busted
    dealer, otherwise the higher total wins and equal totals push.
    """

    metadata = {'render.modes':['human']}

    ### Probability of drawing each card value 1..10 (10 also covers J, Q, K)
    _CARD_PROBS = [1/13] * 9 + [4/13]

    def __init__(self):
        self.observation_space = spaces.Discrete(2000000)
        self.action_space = spaces.Discrete(4)
        self.actions = ['hit', 'stick', 'double_down', 'split']
        self.step_count = 0          ### Number of actions taken in the current game
        self.double_count1 = 0       ### 1 once hand 1 has been doubled down
        self.double_count2 = 0       ### 1 once hand 2 has been doubled down
        self.stick_happened = False
        self.can_split = 0
        self.done_split = 0
        self.stage = 1

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _new_result():
        """Return the result dict of a game that is still in progress."""
        return {'winner': '',
                'is_done1': False,
                'is_done2': False,
                'reward': 0}

    def _draw_into(self, hand):
        """Draw one card into ``hand``; return ``(new_hand, has_usable_ace)``.

        An ace that was promoted to 11 earlier is counted as 1 again before
        the new card is added, so a soft hand turns hard instead of busting.
        """
        new_card = np.random.choice(range(1, 11), p=self._CARD_PROBS)
        ### 11 can only be a promoted ace because card values are 1..10
        hard_hand = np.where(hand == 11, 1, hand)
        new_hand = np.array(hard_hand.tolist() + [new_card])
        usable = self.check_usable_ace(new_hand)
        if usable:
            new_hand = self.use_ace(new_hand)
        return new_hand, usable

    def _status(self, mode):
        """Call ``check_game_status`` with the current double-down flags."""
        return self.check_game_status(dd1=self.double_count1,
                                      dd2=self.double_count2,
                                      mode=mode)

    def _dealer_plays_and_settle(self):
        """Let the dealer draw up to 17 or more, then settle the game."""
        while self.dealer_sum < 17:
            self.take_turn('dealer', self.dealer_hand, 0)
        ### 'final' only settles when stage is 3
        self.stage = 3
        return self._status('final')

    def _hand_outcome(self, player_sum):
        """Return +1 / 0 / -1 for a player hand against the dealer's total."""
        player_bust = player_sum > 21
        dealer_bust = self.dealer_sum > 21
        if player_bust and dealer_bust:
            return 0
        if player_bust:
            return -1
        if dealer_bust:
            return 1
        if player_sum > self.dealer_sum:
            return 1
        if player_sum < self.dealer_sum:
            return -1
        return 0

    @staticmethod
    def _stake(double_count):
        """Return the bet multiplier: 1 normally, ``2 * double_count`` if doubled."""
        return 1 if double_count == 0 else 2 * double_count

    # --------------------------------------------------------------- public API

    def check_usable_ace(self,hand):
        """Return True if ``hand`` holds an ace (1) that can count as 11.

        The ace is usable when replacing one 1 by 11 (that is, adding 10)
        keeps the hand total at 21 or below.
        """
        if not np.any(hand == 1):
            return False
        return bool(hand.sum() + 10 <= 21)

    def use_ace(self,hand):
        """Return a copy of ``hand`` with its first ace (1) replaced by 11.

        Raises IndexError when the hand contains no ace.
        """
        promoted = hand.copy()
        promoted[np.where(promoted == 1)[0][0]] = 11
        return promoted

    def reset(self):
        """Deal a new game: two cards for the player and two for the dealer."""
        ### Per-game bookkeeping.  step_count must be cleared here, otherwise
        ### the natural-21 check in step() only runs in the very first game.
        self.step_count = 0
        self.double_count1 = 0
        self.double_count2 = 0
        self.can_split = 0
        self.done_split = 0
        self.stage = 1
        self.stick_happened = False
        self.usable_ace1 = False
        self.usable_ace2 = False

        ### Tells whether the dealer already stands (total of 17 or more)
        self.dealer_stick = False

        ### New player hand; the second hand only exists after a split
        self.current_hand1 = np.random.choice(range(1, 11), 2, p=self._CARD_PROBS)
        self.current_hand2 = np.zeros(1, dtype=int)
        if self.current_hand1[0] == self.current_hand1[1]:
            self.can_split = 1

        ### Promote the player's ace to 11 when that is allowed
        if self.check_usable_ace(self.current_hand1):
            self.usable_ace1 = True
            self.current_hand1 = self.use_ace(self.current_hand1)

        ### State variables: current sums
        self.current_sum1 = self.current_hand1.sum()
        self.current_sum2 = 0

        ### Dealer's new hand; only the first card is shown to the player
        self.dealer_hand = np.random.choice(range(1, 11), 2, p=self._CARD_PROBS)
        self.dealer_sum = self.dealer_hand.sum()
        self.dealer_showing_card = self.dealer_hand[0]

        ### The dealer's total counts a usable ace as 11; the stored hand is
        ### promoted later, on the first call to step()
        if self.check_usable_ace(self.dealer_hand):
            self.dealer_sum = self.use_ace(self.dealer_hand).sum()

    def take_turn(self,player,hand,number):
        """Draw one card for ``player`` and store the resulting hand and sum.

        player -- 'dealer' or 'player'; any other value does nothing.
        hand   -- the hand the card is added to.
        number -- for the player, which hand to update (1 or 2); with any
                  other value the card is drawn but nothing is stored.
        """
        if player == 'dealer':
            self.dealer_hand, _ = self._draw_into(hand)
            self.dealer_sum = self.dealer_hand.sum()

        elif player == 'player':
            new_hand, usable = self._draw_into(hand)
            if number == 1:
                self.current_hand1 = new_hand
                self.current_sum1 = new_hand.sum()
                self.usable_ace1 = usable
            elif number == 2:
                self.current_hand2 = new_hand
                self.current_sum2 = new_hand.sum()
                self.usable_ace2 = usable

    def check_game_status(self, dd1 = 0, dd2 = 0, mode = 'final'):
        """Return the result dict for the current sums.

        mode == 'check': only flags which player hands have reached 21 or
        more (``is_done1`` / ``is_done2``); reward stays 0.

        mode == 'final': settles the game, but only when ``self.stage`` is 3;
        otherwise the in-progress result is returned.  Each hand contributes
        outcome * stake to the reward, where the stake is 1 or, when the
        matching ``dd`` flag is non-zero, ``2 * dd``.  ``winner`` follows the
        sign of the total reward ('player', 'dealer' or 'draw').
        """
        result = self._new_result()

        if mode == 'check':
            result['is_done1'] = bool(self.current_sum1 >= 21)
            result['is_done2'] = bool(self.current_sum2 >= 21)
            return result

        if mode != 'final' or self.stage != 3 or self.done_split not in (0, 1):
            return result

        reward = self._hand_outcome(self.current_sum1) * self._stake(dd1)
        if self.done_split == 1:
            reward += self._hand_outcome(self.current_sum2) * self._stake(dd2)

        if reward > 0:
            winner = 'player'
        elif reward < 0:
            winner = 'dealer'
        else:
            winner = 'draw'

        result['is_done1'] = True
        result['is_done2'] = True
        result['reward'] = reward
        result['winner'] = winner
        return result

    def step(self,action):
        """Apply ``action`` (0 hit, 1 stick, 2 double down, 3 split).

        Returns the result dict described in the class docstring.  Actions
        that are not available in the current stage (for example a split
        without a pair, or any action once the game is settled) leave the
        game unchanged and return the in-progress result.
        """
        self.step_count += 1  ### Number of actions taken in the game till now
        result = self._new_result()

        if self.stage == 1:

            if self.step_count == 1:
                ### Store promoted aces in the hands themselves
                if self.check_usable_ace(self.current_hand1):
                    self.current_hand1 = self.use_ace(self.current_hand1)
                if self.check_usable_ace(self.dealer_hand):
                    self.dealer_hand = self.use_ace(self.dealer_hand)

                ### "Natural": a two-card 21 ends the game before any action
                natural = None
                if self.current_sum1 == 21 and self.dealer_sum == 21:
                    natural = (0, 'draw')
                elif self.current_sum1 == 21 and self.dealer_sum < 21:
                    natural = (1, 'player')
                elif self.dealer_sum == 21 and self.current_sum1 < 21:
                    natural = (-1, 'dealer')
                if natural is not None:
                    result['is_done1'] = True
                    result['is_done2'] = True
                    result['reward'], result['winner'] = natural
                    return result

                if self.dealer_sum >= 17:
                    self.dealer_stick = True

            if action == 0:  ### hit
                self.take_turn('player', self.current_hand1, 1)
                result = self._status('check')
                if result['is_done1']:
                    if self.done_split == 0:
                        ### 21 or bust on the only hand: settle immediately
                        self.stage = 3
                        result = self._status('final')
                        result['is_done2'] = True
                    else:
                        ### Hand 1 is finished; hand 2 gets its second card
                        self.stage = 2
                        self.take_turn('player', self.current_hand2, 2)
                return result

            if action == 1:  ### stick
                if self.done_split == 0:
                    self.stick_happened = True
                    self.dealer_stick = True
                    return self._dealer_plays_and_settle()
                self.stage = 2
                self.take_turn('player', self.current_hand2, 2)
                result = self._status('check')
                result['is_done1'] = True
                return result

            if action == 2:  ### double down: exactly one more card
                self.double_count1 = 1
                self.take_turn('player', self.current_hand1, 1)
                if self.done_split == 0:
                    return self._dealer_plays_and_settle()
                self.stage = 2
                self.take_turn('player', self.current_hand2, 2)
                return self._status('check')

            if action == 3 and self.can_split == 1 and self.done_split != 1:  ### split
                self.can_split = 0
                self.done_split = 1
                card_dtype = self.current_hand1.dtype
                first_card = np.array([self.current_hand1[0]], dtype=card_dtype)
                second_card = np.array([self.current_hand1[1]], dtype=card_dtype)
                self.current_hand1 = first_card
                self.current_hand2 = second_card
                ### Hand 1 gets its second card right away
                self.take_turn('player', self.current_hand1, 1)
                return self._status('check')

        elif self.stage == 2:

            if action == 0:  ### hit
                self.take_turn('player', self.current_hand2, 2)
                result = self._status('check')
                if result['is_done2']:
                    return self._dealer_plays_and_settle()
                return result

            if action == 1:  ### stick
                self.stick_happened = True
                return self._dealer_plays_and_settle()

            if action == 2:  ### double down
                self.double_count2 = 1
                self.take_turn('player', self.current_hand2, 2)
                return self._dealer_plays_and_settle()

        return result

    def get_current_state(self):
        '''
        returns the observable state variables as a dict: both current sums,
        the dealer's showing card, the stage, both usable-ace flags, the
        stick flag, both double-down flags and the two split flags
        '''
        return {
            'current_sum1': self.current_sum1,
            'current_sum2': self.current_sum2,
            'dealer_showing_card': self.dealer_showing_card,
            'stage': self.stage,
            'usable_ace1': self.usable_ace1,
            'usable_ace2': self.usable_ace2,
            'stick_happened': self.stick_happened,
            'double_count1': self.double_count1,
            'double_count2': self.double_count2,
            'can_split': self.can_split,
            'done_split': self.done_split,
        }

    def render(self):
        """Print the observable state followed by the hidden hands."""
        observable = [
            ('Current Sum 1', self.current_sum1),
            ('Current Sum 2', self.current_sum2),
            ('Dealer Showing Card', self.dealer_showing_card),
            ('Stage', self.stage),
            ('Usable Ace1', self.usable_ace1),
            ('Usable Ace2', self.usable_ace2),
            ('Stick happened', self.stick_happened),
            ('Double down 1', self.double_count1),
            ('Double down 2', self.double_count2),
            ('Can split', self.can_split),
            ('Done split', self.done_split),
        ]
        auxiliary = [
            ('Current Hand 1', self.current_hand1),
            ('Current Hand 2', self.current_hand2),
            ('Dealer Hand', self.dealer_hand),
            ('Dealer Sum', self.dealer_sum),
        ]

        print('OBSERVABLE STATES')
        for label, value in observable:
            print('{} - {}'.format(label, value))

        print('AUXILLARY INFORMATION ------------------------------')
        for label, value in auxiliary:
            print('{} - {}'.format(label, value))

In [None]:
bj = BlackJackEnv() # must always be re-created because of the double down

In [None]:
# Deal a new game and print its state
bj.reset()
bj.render()

In [None]:
# Take action 1 (stick), print the returned result dict, then the state
print(bj.step(1))
bj.render()

#### Q-Learning

In [None]:
#### Lookup tables that translate each state variable (and each action name)
#### into its index along the matching axis of the Q value table

#### hand 1 total: 4..32 -> 0..28
current_sum_to_index1 = {
    total: idx for total, idx in zip(np.arange(4, 33), np.arange(29))
}
#### hand 2 total: 0 (no second hand yet) -> 0, then 4..32 -> 1..29
current_sum_to_index2 = {0: 0}
current_sum_to_index2.update(zip(np.arange(4, 33), np.arange(1, 30)))
#### dealer's showing card: 1..10 -> 0..9
dealer_showing_card_to_index = {
    card: idx for card, idx in zip(np.arange(1, 11), np.arange(10))
}
#### stage: 1..3 -> 0..2
stage_index = {
    stage: idx for stage, idx in zip(np.arange(1, 4), np.arange(3))
}
#### boolean flags: False -> 0, True -> 1
usable_ace_index1 = {False: 0, True: 1}
usable_ace_index2 = {False: 0, True: 1}
stick_happened_index = {False: 0, True: 1}
#### 0/1 counters map onto themselves
double_count_index1 = {
    flag: idx for flag, idx in zip(np.arange(0, 2), np.arange(2))
}
double_count_index2 = {
    flag: idx for flag, idx in zip(np.arange(0, 2), np.arange(2))
}
can_split_index = {
    flag: idx for flag, idx in zip(np.arange(0, 2), np.arange(2))
}
done_split_index = {
    flag: idx for flag, idx in zip(np.arange(0, 2), np.arange(2))
}
#### action name -> action number
action_index = {'hit': 0, 'stick': 1, 'double_down': 2, 'split': 3}

def get_state_q_indices(current_state):
    '''
    translate a state dict into the list of its 11 Q table indices,
    one per state axis and in axis order

    '''
    # (lookup table, state key) for every state axis of the Q table, in order
    axis_lookups = (
        (current_sum_to_index1, 'current_sum1'),
        (current_sum_to_index2, 'current_sum2'),
        (dealer_showing_card_to_index, 'dealer_showing_card'),
        (stage_index, 'stage'),
        (usable_ace_index1, 'usable_ace1'),
        (usable_ace_index2, 'usable_ace2'),
        (stick_happened_index, 'stick_happened'),
        (double_count_index1, 'double_count1'),
        (double_count_index2, 'double_count2'),
        (can_split_index, 'can_split'),
        (done_split_index, 'done_split'),
    )

    return [table[current_state[key]] for table, key in axis_lookups]

def get_max_action(Q_sa, current_state):
    '''
    return the index of the action with the highest Q value in the
    given state (the first one on ties, as argmax does)

    '''
    state_index = tuple(get_state_q_indices(current_state))
    # Fix the 11 state axes and keep the whole action axis
    action_values = Q_sa[state_index + (slice(None),)]

    return action_values.argmax()

def get_q_value(Q_sa, state, action):
    '''
    look up the Q table entry for one state / action pair

    '''
    table_index = tuple(get_state_q_indices(state)) + (action,)

    return Q_sa[table_index]
#print(current_sum_to_index2)

In [None]:
### Axes of the Q value table (sizes match the index dictionaries above)
### first dimension  - current sum 1 (4-32, 29 values)
### second dimension - current sum 2 (0, 4-32, 30 values)
### third dimension  - dealers showing card (1-10)
### fourth dimension - stage (1-3)
### fifth dimension - usable ace 1 (False,True)
### sixth dimension - usable ace 2 (False,True)
### seventh dimension - stick happened (False,True)
### eighth dimension - double down 1 (0,1)
### ninth dimension - double down 2 (0,1)
### tenth dimension - can split (0,1)
### eleventh dimension - split done (0,1)
### twelfth dimension - action (hit, stick, double down, split)

Q_opt = np.zeros((29, 30, 10, 3, 2, 2, 2, 2, 2, 2, 2, 4)) #### Initializing the Q value Table with zeros

In [None]:
#### Greedy Q-learning run that fills Q_opt, the reference table used by the
#### convergence plot in the next cell.
episode_count = 0
total_episodes = 100000
gamma = 0.9             #### the discount factor
alpha = 0.1             #### learning rate
max_steps_per_episode = 50   #### safety cap: actions that are not available
                             #### leave the game unchanged and could repeat forever
bj = BlackJackEnv()

# Initialize variables for tracking runtime and errors
start_time = time.time()

while episode_count < total_episodes:

    bj.reset()  ### Initialize S (the environment's starting state)
    current_state = bj.get_current_state()

    #### One full game per episode.  Resetting before every single action
    #### would mean that only the opening move of a game is ever learned.
    for _ in range(max_steps_per_episode):

        current_action = get_max_action(Q_opt, current_state)

        ### Take Action
        step_result = bj.step(current_action)
        next_state = bj.get_current_state()
        immediate_reward = step_result['reward']
        episode_done = step_result['is_done1'] and step_result['is_done2']

        #### Value of the best action in the next state; a finished game
        #### has no future reward to bootstrap from
        if episode_done:
            q_max_s_a = 0.0
        else:
            next_max_action = get_max_action(Q_opt, next_state)
            q_max_s_a = get_q_value(Q_opt, next_state, next_max_action)

        td_target = immediate_reward + gamma * q_max_s_a

        #### Getting Q value for the current state and action
        q_current_s_a = get_q_value(Q_opt, current_state, current_action)
        td_error = td_target - q_current_s_a

        #### Updating current Q(S,A)
        state_q_idxs = get_state_q_indices(current_state)
        Q_opt[tuple(state_q_idxs) + (current_action,)] = q_current_s_a + alpha*td_error

        current_state = next_state  ### S=S'

        if episode_done:
            break

    #### Episodes stopped by the step cap are counted as well, so the run
    #### always terminates
    episode_count+=1

    if episode_count%10000 == 0:
        print('---------Episode - {} -----------'.format(episode_count))

In [None]:
# Display the learned reference Q table
Q_opt

In [None]:
#### Second greedy Q-learning run.  After every episode the largest absolute
#### difference to the reference table Q_opt is recorded and finally plotted.
Q = np.zeros((29, 30, 10, 3, 2, 2, 2, 2, 2, 2, 2, 4))
episode_count = 0
total_episodes = 100000
gamma = 0.9             #### the discount factor
alpha = 0.1             #### learning rate
max_steps_per_episode = 50   #### safety cap: actions that are not available
                             #### leave the game unchanged and could repeat forever
bj = BlackJackEnv()

# Initialize variables for tracking runtime and errors
start_time = time.time()
diffs = []


while episode_count < total_episodes:

    bj.reset()  ### Initialize S (the environment's starting state)
    current_state = bj.get_current_state()

    #### One full game per episode.  Resetting before every single action
    #### would mean that only the opening move of a game is ever learned.
    for _ in range(max_steps_per_episode):

        current_action = get_max_action(Q, current_state)

        ### Take Action
        step_result = bj.step(current_action)
        next_state = bj.get_current_state()
        immediate_reward = step_result['reward']
        episode_done = step_result['is_done1'] and step_result['is_done2']

        #### Value of the best action in the next state; a finished game
        #### has no future reward to bootstrap from
        if episode_done:
            q_max_s_a = 0.0
        else:
            next_max_action = get_max_action(Q, next_state)
            q_max_s_a = get_q_value(Q, next_state, next_max_action)

        td_target = immediate_reward + gamma * q_max_s_a

        #### Getting Q value for the current state and action
        q_current_s_a = get_q_value(Q, current_state, current_action)
        td_error = td_target - q_current_s_a

        #### Updating current Q(S,A)
        state_q_idxs = get_state_q_indices(current_state)
        Q[tuple(state_q_idxs) + (current_action,)] = q_current_s_a + alpha*td_error

        current_state = next_state  ### S=S'

        if episode_done:
            break

    #### Episodes stopped by the step cap are counted as well, so the run
    #### always terminates
    episode_count+=1
    #### NOTE(review): this scans both full Q tables once per episode, which
    #### dominates the runtime; sample less often if that becomes a problem
    diffs.append(np.max(np.abs(Q_opt - Q)))

    if episode_count%10000 == 0:
        print('---------Episode - {} -----------'.format(episode_count))

# Calculate total runtime
end_time = time.time()
runtime = end_time - start_time

print(f"Total runtime: {runtime} seconds")

plt.figure(figsize=(10, 6))
# The label gives plt.legend() an entry to show
plt.plot(diffs, label='max |Q_opt - Q|')
plt.xlabel('Iteration')
plt.ylabel('Delta')
plt.title('Convergence of Q Learning')
plt.legend()
plt.grid()
plt.savefig("q_learning.jpg", format="jpg", dpi=300)
plt.show()

In [None]:
# Number of recorded error samples (one is appended per finished episode above).
print(len(diffs))

#### SARSA

In [None]:
def get_action_epsilon_greedy(Q_sa, current_state, epsilon):
    '''
    Pick an action with an epsilon-greedy rule.

    A uniform sample from [0, 1) is compared with ``epsilon``: below it, an
    exploratory action is drawn uniformly from {0, 1}; otherwise the action
    returned by ``get_max_action(Q_sa, current_state)`` is used.

    Note that exploration only ever proposes actions 0 and 1, even though the
    Q-tables in this notebook have four action slots.
    '''
    explore = np.random.rand() < epsilon
    if not explore:
        return get_max_action(Q_sa, current_state)
    return np.random.choice([0, 1])  # random action

In [None]:
# ---- Tabular SARSA (on-policy TD control) on BlackJackEnv ----
Q = np.zeros((29, 30, 10, 3, 2, 2, 2, 2, 2, 2, 2, 4)) #### Initializing the Q value Table with zeros
episode_count = 0
total_episodes = 1000
gamma = 0.9             #### the discount factor
alpha = 0.1             #### learning rate
epsilon = 0.1           #### epsilon for epsilon-greedy policy
bj = BlackJackEnv()

# Initialize variables for tracking runtime and errors
start_time = time.time()
diffs2 = []             #### max |Q_opt - Q|, recorded after every finished episode

while episode_count < total_episodes:
    ### Initialize S and choose A once per episode. Resetting before every
    ### step discarded the A=A' hand-over and limited learning to first moves.
    bj.reset()
    current_state = bj.get_current_state()
    current_action = get_action_epsilon_greedy(Q, current_state, epsilon)
    episode_done = False

    while not episode_done:
        step_result = bj.step(current_action)

        next_state = bj.get_current_state()
        immediate_reward = step_result['reward']
        episode_done = bool(step_result['is_done1'] and step_result['is_done2'])

        #### A' is only needed (and only meaningful) while the game continues;
        #### at the terminal step the bootstrap term is zero.
        if episode_done:
            next_action = None
            q_next_s_a = 0.0
        else:
            next_action = get_action_epsilon_greedy(Q, next_state, epsilon)
            q_next_s_a = get_q_value(Q, next_state, next_action)

        q_current_s_a = get_q_value(Q, current_state, current_action)

        td_target = immediate_reward + gamma * q_next_s_a
        td_error = td_target - q_current_s_a

        #### Index Q with the indices of the CURRENT state. The previous
        #### version computed them into an unused name and indexed with a
        #### stale `state_q_idxs` left over from another cell.
        state_q_idxs = get_state_q_indices(current_state)
        q_index = tuple(state_q_idxs[i] for i in range(11)) + (current_action,)
        Q[q_index] = q_current_s_a + alpha * td_error

        current_state = next_state  ### S=S'
        current_action = next_action  ### A=A'

    episode_count += 1
    diffs2.append(np.max(np.abs(Q_opt - Q)))

    if episode_count % 100 == 0:
        print('---------Episode - {} -----------'.format(episode_count))

# Calculate total runtime
end_time = time.time()
runtime = end_time - start_time

print(f"Total runtime: {runtime} seconds")

plt.figure(figsize=(10, 6))
# The curve carries a label so that plt.legend() has an entry to show.
plt.plot(diffs2, label='max |Q_opt - Q|')
plt.xlabel('Iteration')
plt.ylabel('Delta')
plt.title('Convergence of SARSA')
plt.legend()
plt.grid()
plt.savefig("sarsa.jpg", format="jpg", dpi=300)
plt.show()

### MDP solution approaches

In [None]:
class Sol_Env(gym.Env):
    """Explicit MDP model of the blackjack game for the dynamic-programming solvers.

    A state is the 11-tuple::

        (player_sum_1, player_sum_2, dealer_sum, stage,
         usable_ace_1, usable_ace_2, stick_happened,
         double_down_1, double_down_2, can_split, split_done)

    ``stage`` is 1 while hand 1 is being played, 2 while hand 2 is being
    played (only reachable after a split) and 3 once the round is settled.
    Rewards are only paid on transitions into stage 3.
    """

    # Probability of drawing each card value 2..11 (11 is the ace). The 4/13
    # weight on 10 presumably stands for 10/J/Q/K.
    _CARD_PROBS = {
        **{card: 1 / 13 for card in range(2, 10)},
        10: 4 / 13,
        11: 1 / 13,
    }

    def __init__(self):
        # Define ranges for each state variable
        # Hand totals go up to 31: a hand may be hit at 20 and receive an 11.
        self.player_sum_1_range = range(4, 32)
        # 0 means "no second hand" (no split has been done).
        self.player_sum_2_range = [0] + list(range(4, 32))
        # The dealer draws only below 18, so the largest total is 17 + 11 = 28.
        self.dealer_sum_range = range(4, 29)
        self.stage_range = range(1, 4)  # Stages 1, 2, 3
        self.usable_ace_1 = [False, True]  # Usable ace for hand 1
        self.usable_ace_2 = [False, True]  # Usable ace for hand 2
        self.stick_happened = [False, True]  # Set once the dealer has drawn after a stick
        self.double_down_1 = [0, 1]  # Double down status for hand 1
        self.double_down_2 = [0, 1]  # Double down status for hand 2
        self.can_split = [0, 1]  # Whether split is possible
        self.split_done = [0, 1]  # Whether split has been done

        # Define actions
        self.actions = ['hit', 'stick', 'double_down', 'split']

    @staticmethod
    def _is_valid_state(state):
        """Return False for state tuples whose components contradict each other."""
        (sum_1, sum_2, _dealer, stage, ace_1, ace_2, _stick,
         dd1, dd2, can_split, split_done) = state
        if can_split == 1 and split_done == 1:
            return False  # a hand that was already split cannot be split again
        if sum_2 == 0 and stage == 2:
            return False  # stage 2 plays the second hand, so it must exist
        if split_done == 0 and sum_2 != 0:
            return False  # no split, hence no second hand
        if split_done == 1 and sum_2 == 0:
            return False  # split done, hence a second hand
        if sum_1 < 10 and ace_1:
            return False  # hand 1 too small to hold a usable ace
        if sum_2 < 10 and ace_2:
            return False  # hand 2 too small to hold a usable ace
        if sum_2 == 0 and dd2 == 1:
            return False  # cannot double down on a hand that does not exist
        if stage in (1, 2) and dd1 == 1 and dd2 == 1:
            return False  # both hands doubled only happens once play is over
        if stage == 1 and dd1 == 0 and dd2 == 1:
            return False  # hand 2 cannot be doubled before hand 1 is finished
        return True

    def get_all_states(self):
        """Return a list of every consistent state tuple.

        The Cartesian product of all component ranges is filtered lazily
        (the unfiltered product has several million entries), keeping the
        same states in the same order as filtering a materialised list.
        """
        candidates = product(
            self.player_sum_1_range,  # Current sum 1
            self.player_sum_2_range,  # Current sum 2
            self.dealer_sum_range,  # Dealer's sum
            self.stage_range,  # Stage
            self.usable_ace_1,  # Usable ace for hand 1
            self.usable_ace_2,  # Usable ace for hand 2
            self.stick_happened,  # Dealer has drawn after a stick
            self.double_down_1,  # Double down status for hand 1
            self.double_down_2,  # Double down status for hand 2
            self.can_split,  # Can split
            self.split_done,  # Split done
        )
        return [state for state in candidates if self._is_valid_state(state)]

    def get_possible_actions(self, state):
        """Return the action names offered in ``state`` (empty in stage 3)."""
        player_sum_1, _, _, stage, _, _, _, _, _, can_split, _ = state
        if stage == 3:  # If it's the third stage, no further actions are possible
            return []
        actions = ['hit', 'stick', 'double_down']
        if can_split and player_sum_1 % 2 == 0 and stage == 1:
            actions.append('split')
        return actions

    @staticmethod
    def _stake(double_down):
        """Bet multiplier of a hand: 1 normally, ``2 * double_down`` when doubled."""
        return 1 if double_down == 0 else 2 * double_down

    @staticmethod
    def _hand_outcome(player_sum, dealer_sum):
        """Return +1 / 0 / -1 for one player hand settled against the dealer.

        Both sides busting is a push (0); a lone bust loses; otherwise the
        total closer to 21 wins and equal totals push.
        """
        player_bust = player_sum > 21
        dealer_bust = dealer_sum > 21
        if player_bust and dealer_bust:
            return 0
        if player_bust:
            return -1
        if dealer_bust:
            return 1
        return (player_sum > dealer_sum) - (player_sum < dealer_sum)

    def get_reward(self, player_sum_1, player_sum_2, dealer_sum, stage, dd1, dd2, done_split):
        """Return the payout of a state; non-zero only when ``stage == 3``.

        Each hand is settled independently with ``_hand_outcome`` and weighted
        by its stake; after a split the two results are added. Settling per
        hand also handles the cases the former hand-enumerated table got
        wrong: dealer 21 against (21, <21), and split hands that tie with a
        dealer total below 21.
        """
        if stage != 3:
            return 0
        if done_split == 0:
            return self._stake(dd1) * self._hand_outcome(player_sum_1, dealer_sum)
        if done_split == 1:
            return (self._stake(dd1) * self._hand_outcome(player_sum_1, dealer_sum)
                    + self._stake(dd2) * self._hand_outcome(player_sum_2, dealer_sum))
        return 0

    @staticmethod
    def _add_card(hand_sum, usable_ace, card):
        """Return ``(new_sum, new_usable_ace)`` after adding ``card`` to a hand.

        An existing usable ace is downgraded (-10) when the new total busts.
        A drawn ace (11) becomes the usable ace when the old total was below 11.
        NOTE(review): a drawn ace always adds 11, so it busts a hard 11..20;
        BlackJackEnv counts such an ace as 1 -- confirm which rule is intended.
        """
        new_sum = hand_sum + card
        new_ace = usable_ace
        if new_sum > 21 and usable_ace:
            new_sum -= 10
            new_ace = False
        if card == 11 and hand_sum < 11 and not new_ace:
            new_ace = True
        return new_sum, new_ace

    def get_transition_probabilities(self, state, action):
        """Return the outcomes of ``action`` in ``state``.

        Returns:
            A list of ``(next_state, probability, reward)`` triples. The list
            is empty when the action has no modelled effect in this state
            (e.g. hitting at 21 or more, or sticking when the dealer already
            shows 18 or more).
        """
        (player_sum_1, player_sum_2, dealer_sum, stage, ace_1, ace_2,
         stick_happened, dd1, dd2, can_split, split_done) = state
        transitions = []

        if action == 'hit':
            if stage == 1 and player_sum_1 < 21:
                # Reaching 21 or busting ends hand 1: settle, or move to hand 2 after a split.
                end_stage = 3 if split_done == 0 else 2
                for card, prob in self._CARD_PROBS.items():
                    new_sum, new_ace_1 = self._add_card(player_sum_1, ace_1, card)
                    # Decided per card so that one finishing card does not
                    # change the stage recorded for the other cards.
                    new_stage = end_stage if new_sum >= 21 else stage
                    next_state = (new_sum, player_sum_2, dealer_sum, new_stage, new_ace_1, ace_2,
                                  stick_happened, dd1, dd2, can_split, split_done)
                    reward = self.get_reward(new_sum, player_sum_2, dealer_sum, new_stage, dd1, dd2, split_done)
                    transitions.append((next_state, prob, reward))
            elif stage == 2 and player_sum_2 < 21:
                for card, prob in self._CARD_PROBS.items():
                    new_sum, new_ace_2 = self._add_card(player_sum_2, ace_2, card)
                    new_stage = 3 if new_sum >= 21 else stage
                    next_state = (player_sum_1, new_sum, dealer_sum, new_stage, ace_1, new_ace_2,
                                  stick_happened, dd1, dd2, can_split, split_done)
                    reward = self.get_reward(player_sum_1, new_sum, dealer_sum, new_stage, dd1, dd2, split_done)
                    transitions.append((next_state, prob, reward))

        elif action == 'stick':
            # The dealer plays once the player's last hand stands.
            last_stage = 1 if split_done == 0 else 2
            if stage == last_stage:
                if dealer_sum < 18:
                    for card, prob in self._CARD_PROBS.items():
                        new_dealer_sum = dealer_sum + card
                        next_state = (player_sum_1, player_sum_2, new_dealer_sum, 3, ace_1, ace_2,
                                      True, dd1, dd2, can_split, split_done)
                        reward = self.get_reward(player_sum_1, player_sum_2, new_dealer_sum, 3, dd1, dd2, split_done)
                        # Weighted by the card probability so the outcomes sum to 1.
                        transitions.append((next_state, prob, reward))
            elif stage == 1:
                # Split game: standing on hand 1 moves play to hand 2.
                next_state = (player_sum_1, player_sum_2, dealer_sum, 2, ace_1, ace_2,
                              stick_happened, dd1, dd2, can_split, split_done)
                reward = self.get_reward(player_sum_1, player_sum_2, dealer_sum, 2, dd1, dd2, split_done)
                transitions.append((next_state, 1.0, reward))

        elif action == 'double_down':
            # Exactly one card is drawn and the hand ends whatever the total.
            if stage == 1 and dd1 == 0:
                if player_sum_1 < 21:
                    new_stage = 3 if split_done == 0 else 2
                    for card, prob in self._CARD_PROBS.items():
                        new_sum, new_ace_1 = self._add_card(player_sum_1, ace_1, card)
                        next_state = (new_sum, player_sum_2, dealer_sum, new_stage, new_ace_1, ace_2,
                                      stick_happened, 1, dd2, can_split, split_done)
                        reward = self.get_reward(new_sum, player_sum_2, dealer_sum, new_stage, 1, dd2, split_done)
                        transitions.append((next_state, prob, reward))
            elif stage == 2 and dd2 == 0:
                if player_sum_2 < 21:
                    for card, prob in self._CARD_PROBS.items():
                        new_sum, new_ace_2 = self._add_card(player_sum_2, ace_2, card)
                        next_state = (player_sum_1, new_sum, dealer_sum, 3, ace_1, new_ace_2,
                                      stick_happened, dd1, 1, can_split, split_done)
                        reward = self.get_reward(player_sum_1, new_sum, dealer_sum, 3, dd1, 1, split_done)
                        transitions.append((next_state, prob, reward))

        elif action == 'split':
            if can_split and stage == 1 and not split_done and not ace_1 and not ace_2 and dd1 == 0 and dd2 == 0 and dealer_sum <= 17:
                # Each new hand starts with half of the pair's total.
                split_state = (player_sum_1 // 2, player_sum_1 // 2, dealer_sum, stage, ace_1, ace_2,
                               stick_happened, dd1, dd2, 0, 1)
                transitions.append((split_state, 1.0, 0))  # No immediate reward for splitting

        return transitions

In [None]:
# Rebind `bj` to the explicit MDP model (the learning cells above bound it to
# a BlackJackEnv). The commented lines are ad-hoc state-space inspection.
bj = Sol_Env()
#len(bj.get_all_states())
#bj.get_all_states()[:500]

Value iteration

In [None]:
def value_iteration_to_get_opt_solution(env, n = 10000, theta=1e-10):
    """Compute the reference value function by in-place value iteration.

    Every non-terminal state (``state[3] != 3``) is backed up with the
    undiscounted Bellman optimality operator; updated values are visible to
    later states of the same sweep.

    Args:
        env: object providing ``get_all_states()``,
            ``get_possible_actions(state)`` and
            ``get_transition_probabilities(state, action)``, the latter
            returning ``(next_state, prob, reward)`` triples.
        n: maximum number of sweeps.
        theta: stop once the largest change within one sweep is below this.

    Returns:
        dict mapping every state to its value. States that are never backed
        up (stage 3 and the placeholder split states) keep the value 0.
    """
    all_states = env.get_all_states()
    non_terminal_states = [state for state in all_states if state[3] != 3]

    # Splitting a pair totalling 4 or 6 leads to hands of 2 or 3, which lie
    # outside the enumerated sum range; register them so lookups succeed.
    all_states += [
        (pair_sum, pair_sum, dealer_card, 1, 0, 0, dealer_ace, 0, 0, 0, 1)
        for pair_sum in (2, 3)
        for dealer_card in range(4, 27)
        for dealer_ace in [False, True]
    ]

    V = {state: 0 for state in all_states}  # Initialize value function

    iterations = 0
    while iterations != n:
        delta = 0
        for state in non_terminal_states:
            old_value = V[state]
            max_value = float('-inf')
            for action in env.get_possible_actions(state):
                transitions = env.get_transition_probabilities(state, action)
                action_value = sum(
                    prob * (reward + V[next_state])
                    for next_state, prob, reward in transitions
                )
                if action_value > max_value:
                    max_value = action_value

            V[state] = max_value
            delta = max(delta, abs(old_value - V[state]))

        iterations += 1
        if delta < theta:
            break

    return V

In [None]:
# Reference value function; the solvers below report max |V_opt - V| per sweep.
V_opt = value_iteration_to_get_opt_solution(bj)

In [None]:
def value_iteration(env, V_opt, n = 10000, theta=1e-6):
    """Run synchronous (Jacobi) value iteration and track the distance to ``V_opt``.

    Every sweep backs up all non-terminal states (``state[3] != 3``) from a
    snapshot of the previous sweep's values, so an update never influences
    another update of the same sweep.

    Args:
        env: object providing ``get_all_states()``,
            ``get_possible_actions(state)`` and
            ``get_transition_probabilities(state, action)``, the latter
            returning ``(next_state, prob, reward)`` triples.
        V_opt: mapping with a reference value for every state.
        n: maximum number of sweeps.
        theta: stop once ``max |V_opt - V|`` drops below this.

    Returns:
        ``(V, policy, iterations, delta_list)``: the value function, the
        greedy action per state (``None`` where no backup happened), the number
        of sweeps performed and ``max |V_opt - V|`` after each sweep.
    """
    all_states = env.get_all_states()
    non_terminal_states = [state for state in all_states if state[3] != 3]

    # Splitting a pair totalling 4 or 6 leads to hands of 2 or 3, which lie
    # outside the enumerated sum range; register them so lookups succeed.
    all_states += [
        (pair_sum, pair_sum, dealer_card, 1, 0, 0, dealer_ace, 0, 0, 0, 1)
        for pair_sum in (2, 3)
        for dealer_card in range(4, 27)
        for dealer_ace in [False, True]
    ]

    V = {state: 0 for state in all_states}  # Initialize value function
    policy = {state: None for state in all_states}  # Initialize policy

    iterations = 0
    delta_list = []  # To track convergence

    while iterations != n:
        # Snapshot of the previous sweep. A plain `W = V` would alias the
        # dict and turn this into the in-place (Gauss-Seidel) scheme.
        W = V.copy()
        for state in non_terminal_states:
            max_value = float('-inf')
            best_action = None
            for action in env.get_possible_actions(state):
                transitions = env.get_transition_probabilities(state, action)
                action_value = sum(
                    prob * (reward + W[next_state])
                    for next_state, prob, reward in transitions
                )
                if action_value > max_value:
                    max_value = action_value
                    best_action = action

            V[state] = max_value
            policy[state] = best_action

        diff = max(abs(V_opt[key] - V[key]) for key in V)
        delta_list.append(diff)
        iterations += 1
        if diff < theta:
            break

    return V, policy, iterations, delta_list

In [None]:
# Time one run of value_iteration against the reference solution V_opt.
bj = Sol_Env()
start_time = time.time()
V_vi, policy_vi, iterations_vi, deltas_vi = value_iteration(bj, V_opt)
end_time = time.time()

print(f"Value Iteration completed in {end_time - start_time:.2f} seconds and {iterations_vi} iterations.")
# Plot convergence
def plot_convergence(deltas, label, name):
    """Plot an error curve, save it as a JPEG and show it.

    Args:
        deltas: sequence of error values, one per iteration.
        label: figure title.
        name: output file name passed to ``plt.savefig``.
    """
    plt.figure(figsize=(10, 6))
    # The curve carries a label so that plt.legend() has an entry to show;
    # calling legend() without any labelled artist only produces a warning.
    plt.plot(deltas, label='Delta')
    plt.xlabel('Iteration')
    plt.ylabel('Delta')
    plt.title(label)
    plt.legend()
    plt.grid()
    plt.savefig(name, format="jpg", dpi=300)
    plt.show()
# Error curve of the value_iteration run above, saved to value_iteration.jpg.
plot_convergence(deltas_vi, 'Convergence of Value Iteration', "value_iteration.jpg")

Gauss-Seidel

In [None]:
def gauss_seidel(env, V_opt, n = 10000, theta=1e-6):
    """Run in-place (Gauss-Seidel) value iteration and track the distance to ``V_opt``.

    Within a sweep each non-terminal state (``state[3] != 3``) is backed up
    using the values currently stored in ``V``, including those already
    refreshed earlier in the same sweep. No discount factor is applied.

    Args:
        env: object providing ``get_all_states()``,
            ``get_possible_actions(state)`` and
            ``get_transition_probabilities(state, action)``, the latter
            returning ``(next_state, prob, reward)`` triples.
        V_opt: mapping with a reference value for every state.
        n: maximum number of sweeps.
        theta: stop once ``max |V_opt - V|`` drops below this.

    Returns:
        ``(V, policy, iterations, delta_list)``: the value function, the
        greedy action per state (``None`` where no backup happened), the number
        of sweeps performed and ``max |V_opt - V|`` after each sweep.
    """
    states = env.get_all_states()
    sweep_states = [s for s in states if s[3] != 3]

    # Extra split states with hands of 2 and 3; they are stored in V and
    # policy but are not part of the sweep.
    states += [
        (pair_sum, pair_sum, dealer_card, 1, 0, 0, dealer_ace, 0, 0, 0, 1)
        for pair_sum in (2, 3)
        for dealer_card in range(4, 27)
        for dealer_ace in [False, True]
    ]

    V = dict.fromkeys(states, 0)
    policy = dict.fromkeys(states)

    def backed_up_value(state, action):
        # Expected reward plus current value of the successor.
        outcomes = env.get_transition_probabilities(state, action)
        return sum(
            prob * (reward + V[next_state])
            for next_state, prob, reward in outcomes
        )

    iterations = 0
    delta_list = []
    while iterations != n:
        for state in sweep_states:
            top_value, top_action = float('-inf'), None
            for action in env.get_possible_actions(state):
                candidate = backed_up_value(state, action)
                if candidate > top_value:
                    top_value, top_action = candidate, action
            V[state] = top_value
            policy[state] = top_action

        diff = max(abs(V_opt[key] - V[key]) for key in V)
        delta_list.append(diff)
        iterations += 1
        if diff < theta:
            break

    return V, policy, iterations, delta_list

In [None]:
# Main Execution
# Time one run of gauss_seidel against the reference solution V_opt and plot
# its error curve (saved to gauss-seidel.jpg).
bj = Sol_Env()

start_time = time.time()
V_gs, policy_gs, iterations_gs, deltas_gs = gauss_seidel(bj, V_opt)
end_time = time.time()

print(f"Gauss-Seidel completed in {end_time - start_time:.2f} seconds and {iterations_gs} iterations.")
#print(deltas)
plot_convergence(deltas_gs, 'Convergence of Gauss-Seidel', "gauss-seidel.jpg")

Optimistic Policy iteration

In [None]:
def opt_policy_iteration(env, V_opt, n1=50, theta=1e-6, gamma=0.9):
    """Run ``n1`` rounds of policy iteration, tracking the error against ``V_opt``.

    Every round first evaluates the current policy in place, sweeping the
    non-terminal states until the largest change within a sweep falls below
    ``theta``, and then makes the policy greedy with respect to the resulting
    value function.

    Args:
        env: Object exposing ``get_all_states()``,
            ``get_possible_actions(state)`` and
            ``get_transition_probabilities(state, action)``; the latter yields
            ``(next_state, prob, reward)`` triples.
        V_opt: Mapping from state to a reference value. It must contain every
            state stored in ``V`` (the environment's states plus the extra
            pair states added below).
        n1: Number of evaluation/improvement rounds to perform. Values
            ``<= 0`` perform no rounds.
        theta: Convergence threshold for the policy-evaluation sweeps.
        gamma: Accepted for interface compatibility but not applied; all
            backups are undiscounted.

    Returns:
        A tuple ``(V, policy, iterations1, delta_list, delta_list2)`` where
        ``V`` maps state -> value, ``policy`` maps state -> action (``None``
        when the state has no action), ``iterations1`` is the number of
        rounds performed, ``delta_list`` is the flat list of
        ``max |V_opt - V|`` recorded after every evaluation sweep, and
        ``delta_list2`` holds the same values grouped per round.
    """
    # Copy the list so the one handed out by the environment is never
    # extended in place by the additions below.
    all_states = list(env.get_all_states())

    # States whose fourth field equals 3 are treated as terminal. The filter is
    # applied before the extra states are appended, so only the environment's
    # own states are ever swept.
    non_terminal_states = [state for state in all_states if state[3] != 3]

    # Additional (2, 2, ...) and (3, 3, ...) states that must be present in V
    # and policy, for every dealer card and both dealer_ace values.
    # NOTE(review): presumably these appear as successor states in the
    # transitions returned by env - confirm against Sol_Env.
    all_states += [
        (pair_card, pair_card, dealer_card, 1, 0, 0, dealer_ace, 0, 0, 0, 1)
        for pair_card in (2, 3)
        for dealer_card in range(4, 27)
        for dealer_ace in [False, True]
    ]

    # Start from the first available action of every state; states without
    # any action (e.g. terminal ones) get None.
    policy = {}
    for state in all_states:
        possible_actions = env.get_possible_actions(state)
        policy[state] = possible_actions[0] if possible_actions else None

    V = {state: 0 for state in all_states}

    def expected_return(transitions):
        # One-step undiscounted backup over (next_state, prob, reward) triples,
        # always reading the current (in-place updated) V.
        return sum(prob * (reward + V[next_state])
                   for next_state, prob, reward in transitions)

    iterations1 = 0   # Completed evaluation/improvement rounds
    delta_list = []   # Error vs. V_opt after every evaluation sweep
    delta_list2 = []  # The same errors, one sub-list per round

    # A `<` bound (rather than testing `== n1` after the first round) keeps the
    # loop from running forever when n1 <= 0.
    while iterations1 < n1:
        # Policy Evaluation
        delta_help = []
        while True:
            delta = 0
            for state in non_terminal_states:
                action = policy[state]
                if action is None:
                    continue  # No valid action for this state

                transitions = env.get_transition_probabilities(state, action)
                if not transitions:
                    continue  # Nothing to back up from

                old_value = V[state]
                V[state] = expected_return(transitions)
                delta = max(delta, abs(old_value - V[state]))

            diff = max(abs(V_opt[key] - V[key]) for key in V)
            delta_list.append(diff)
            delta_help.append(diff)

            if delta < theta:
                break  # Evaluation converged for the current policy

        # Policy Improvement: pick the first action with the highest backup.
        for state in non_terminal_states:
            max_value = float('-inf')
            best_action = None

            for action in env.get_possible_actions(state):
                transitions = env.get_transition_probabilities(state, action)
                if not transitions:
                    continue

                action_value = expected_return(transitions)
                if action_value > max_value:
                    max_value = action_value
                    best_action = action

            # Keep the previous action when no candidate had transitions.
            if best_action is not None:
                policy[state] = best_action

        delta_list2.append(delta_help)
        iterations1 += 1
        print(f"Iteration: {iterations1}")

    return V, policy, iterations1, delta_list, delta_list2

In [None]:
# Run policy iteration on a fresh environment and plot the error against
# V_opt recorded after every evaluation sweep.
bj = Sol_Env()

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments.
start_time = time.perf_counter()
V_pi, policy_pi, iterations_pi, deltas_pi, deltas2_pi = opt_policy_iteration(bj, V_opt)
end_time = time.perf_counter()

print(f"Optimistic Policy Iteration completed in {end_time - start_time:.2f} seconds and {iterations_pi} iterations.")
plot_convergence(deltas_pi, 'Convergence of Policy Iteration', 's.jpg')

In [None]:
def plot_convergence_continuous(deltas2, filename="opt_policy_iteration.jpg"):
    """Plot the grouped error values on one continuous x-axis and save the figure.

    The sub-lists of ``deltas2`` are concatenated in order. After each
    non-empty sub-list a dashed red vertical line is drawn at its last index,
    with a label placed at the vertical midpoint of all values plotted so far.

    Args:
        deltas2: Sequence of sequences of numbers, one inner sequence per
            round (as returned in ``delta_list2`` by ``opt_policy_iteration``).
            Empty inner sequences are skipped.
        filename: Path the figure is saved to. The image format is inferred
            from the extension by matplotlib; the default keeps the original
            JPG output.
    """
    plt.figure(figsize=(10, 6))

    full_x = []  # Continuous x-axis indices
    full_y = []  # Concatenated values

    # Running extremes of everything plotted so far; updating them per segment
    # avoids re-scanning the whole of full_y for every sub-list.
    y_low = float('inf')
    y_high = float('-inf')

    for i, sublist in enumerate(deltas2, start=1):
        if len(sublist) == 0:
            # Nothing recorded for this round: no segment to draw or mark
            # (and min()/max() would fail on an empty first segment).
            continue

        first_index = len(full_y)
        full_x.extend(range(first_index, first_index + len(sublist)))
        full_y.extend(sublist)

        y_low = min(y_low, min(sublist))
        y_high = max(y_high, max(sublist))
        y_mid = (y_low + y_high) / 2

        # Mark the last index of this segment
        last_index = len(full_y) - 1
        plt.axvline(x=last_index, color='red', linestyle='--')

        # Label just right of the marker, centered vertically
        plt.text(last_index + 0.2, y_mid, f"Evaluation Iteration {i}",
                 color='red', fontsize=10, ha='left', va='center')

    # Plot the continuous data
    plt.plot(full_x, full_y)

    plt.xlabel('Improvement Iterations')
    plt.ylabel('Delta')
    plt.title('Convergence of Optimistic Policy Iteration')
    plt.grid()
    plt.savefig(filename, dpi=300)
    plt.show()

# Plot the per-round error lists returned by opt_policy_iteration above.
plot_convergence_continuous(deltas2_pi)

### Additional stuff

In [None]:
import matplotlib.pyplot as plt

# Card values 2-11 and their draw probabilities: 1/13 each for 2-9,
# 4/13 for 10, and 1/13 for 11.
card_values = list(range(2, 12))
distr = [1 / 13] * 8 + [4 / 13] + [1 / 13]
distr_dict = dict(zip(card_values, distr))

# Bar chart of the distribution
plt.figure(figsize=(10, 6))
bars = plt.bar(
    distr_dict.keys(),
    distr_dict.values(),
    color='skyblue',
    edgecolor='black',
)

# Axis labels, title, ticks and a light horizontal grid
plt.xlabel("Card Values", fontsize=12)
plt.ylabel("Probability", fontsize=12)
plt.title("Probability Distribution of Card Values", fontsize=14)
plt.xticks(card_values, fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Print each probability (three decimals) centered just above its bar
for rect in bars:
    bar_top = rect.get_height()
    bar_center = rect.get_x() + rect.get_width() / 2
    plt.text(bar_center, bar_top, f"{bar_top:.3f}",
             ha='center', va='bottom', fontsize=10)

# Write the figure to disk as a JPG, then display it
plt.savefig("probability_distribution_with_values.jpg", format="jpg", dpi=300)
plt.show()