In [1]:
import numpy as np
import random

**Q-learning - Blackjack for a single player**


**Environment:**
State -> (total value of the cards in the player's hand, dealer's upcard). Actions -> 0 - stand; 1 - hit

**Q table:**
The Q table is built from dicts and is divided into 2 parts, depending on whether we hold an Ace in our hand - an Ace can count as 1 or 11 (we choose the value, and it can change during a game). The 1st part covers hands with an Ace (so-called soft hands) and the 2nd covers hands without one (so-called hard hands). Every Q value in the Q table is initially 0, except for states in which we already have a 'blackjack' (21).

**Rewards:**
As the algorithm is very sensitive to the reward values, we had difficulty choosing exact values, so we decided to look for an at least decent set of values by simulating candidate values.

**Strategy:**
Rewards are given depending on whether we have won/drawn/lost/lost by bust, and also on whether the action taken follows an already known strategy (taken from the Internet); this type of reward is much lower than the reward given at the end.

**Training (learning of Q table):**
1. Randomly select *initial* state (our and dealer's cards)
2. Select an action for the current state by looking at the possible actions for that state (we only have 2 possible actions, 0/1, so the search is rather shallow) - using an epsilon-greedy algorithm for potential exploration
3. Choose adequate reward for taken action
4. Update the Q table using the Bellman equation - if we have decided to choose action 0 (stand), there are no potential future states, so we substitute 0 for maxQ(s',a') and we finish our move - it is the end of the episode
5. If a = 1, we perform the hit action - 1 additional card is drawn, the new state becomes our current state, and we go back to step 2.


In [61]:
class BlackJack:
    def __init__(self):
        self.alfa = 0.02137
        self.gamma = 0.81
    
        #states: (Sum of cards | visible dealers card (upcard))
        #actions: 1 - Hit, 0 - Stand   
        #initial q values - 0 (all)     
        #Q table - 2 sections - with an Ace and with no Ace 
        #I
        self.Q_table_no_Ace = {}
        for i in range(4, 22):                                  #sum of card values in our hands
            for j in range(2, 11):                              #upcard - dealer's upcard is not an Ace
                    self.Q_table_no_Ace[(i, j)] = {}
                    for a in [1, 0]:                           
                        self.Q_table_no_Ace[(i, j)][a] = 0  
                    if i == 21:
                        self.Q_table_no_Ace[(i, j)][0] = 100
                        self.Q_table_no_Ace[(i, j)][0] = -50
                        
                        
            self.Q_table_no_Ace[(i, 'A')] = {}                  #upcard - dealer's upcard is an Ace
            for a in [1, 0]:                                    
                self.Q_table_no_Ace[(i, 'A')][a] = 0            
            if i == 21:
                self.Q_table_no_Ace[(i, 'A')][0] = 100
                self.Q_table_no_Ace[(i, 'A')][0] = -50
    
        for j in range(2, 11):
            self.Q_table_no_Ace[('bust', j)] = {}
            for a in [1, 0]:                           
                self.Q_table_no_Ace[('bust', j)][a] = 0  
        
        self.Q_table_no_Ace[('bust', 'A')] = {}                
        for a in [1, 0]:                                    
            self.Q_table_no_Ace[('bust', 'A')][a] = 0 
                
                    
        #II      
        self.Q_table_with_Ace = {}
        self.hands = [('AA', 12), ('A2', 13), ('A3', 14), ('A4', 15), ('A5', 16), ('A6', 17), ('A7', 18), ('A8', 19), ('A9', 20), ('A10', 21)]   
        for hand in self.hands:
            for j in range(2, 11):
                self.Q_table_with_Ace[(hand, j)] = {}
                for a in [1, 0]:
                    self.Q_table_with_Ace[(hand, j)][a] = 0
                if hand == ('A10', 21):
                    self.Q_table_with_Ace[(hand, j)][0] = 100
                    self.Q_table_with_Ace[(hand, j)][1] = -50
            
            self.Q_table_with_Ace[(hand, 'A')] = {}
            for a in [1, 0]:
                self.Q_table_with_Ace[(hand, 'A')][a] = 0
            if hand == ('A10', 21):
                self.Q_table_with_Ace[(hand, j)][0] = 100
                self.Q_table_with_Ace[(hand, j)][1] = -50    
                
        for j in range(2, 11):
            self.Q_table_with_Ace[('bust', j)] = {}
            for a in [1, 0]:                           
                self.Q_table_with_Ace[('bust', j)][a] = 0  
        
        self.Q_table_with_Ace[('bust', 'A')] = {}                
        for a in [1, 0]:                                    
            self.Q_table_with_Ace[('bust', 'A')][a] = 0 
        
        
        #Strategy 'table' (dict) | (state): action that makes reward = 1
        
        #yes, we know...
        self.strategy_without_ace = {
            (4, 2): 1, (4, 3): 1, (4, 4): 1, (4, 5): 1, (4, 6): 1, (4, 7): 1, (4, 8): 1, (4, 9): 1, (4, 10): 1, (4, 'A'): 1,
            (5, 2): 1, (5, 3): 1, (5, 4): 1, (5, 5): 1, (5, 6): 1, (5, 7): 1, (5, 8): 1, (5, 9): 1, (5, 10): 1, (5, 'A'): 1,
            (6, 2): 1, (6, 3): 1, (6, 4): 1, (6, 5): 1, (6, 6): 1, (6, 7): 1, (6, 8): 1, (6, 9): 1, (6, 10): 1, (6, 'A'): 1,
            (7, 2): 1, (7, 3): 1, (7, 4): 1, (7, 5): 1, (7, 6): 1, (7, 7): 1, (7, 8): 1, (7, 9): 1, (7, 10): 1, (7, 'A'): 1,
            (8, 2): 1, (8, 3): 1, (8, 4): 1, (8, 5): 1, (8, 6): 1, (8, 7): 1, (8, 8): 1, (8, 9): 1, (8, 10): 1, (8, 'A'): 1,
            (9, 2): 1, (9, 3): 1, (9, 4): 1, (9, 5): 1, (9, 6): 1, (9, 7): 1, (9, 8): 1, (9, 9): 1, (9, 10): 1, (9, 'A'): 1,
            (10, 2): 1, (10, 3): 1, (10, 4): 1, (10, 5): 1, (10, 6): 1, (10, 7): 1, (10, 8): 1, (10, 9): 1, (10, 10): 1, (10, 'A'): 1,
            (11, 2): 1, (11, 3): 1, (11, 4): 1, (11, 5): 1, (11, 6): 1, (11, 7): 1, (11, 8): 1, (11, 9): 1, (11, 10): 1, (11, 'A'): 1,
            (12, 2): 1, (12, 3): 1, (12, 4): 0, (12, 5): 0, (12, 6): 0, (12, 7): 1, (12, 8): 1, (12, 9): 1, (12, 10): 1, (12, 'A'): 1,
            (13, 2): 0, (13, 3): 0, (13, 4): 0, (13, 5): 0, (13, 6): 0, (13, 7): 1, (13, 8): 1, (13, 9): 1, (13, 10): 1, (13, 'A'): 1,
            (14, 2): 0, (14, 3): 0, (14, 4): 0, (14, 5): 0, (14, 6): 0, (14, 7): 1, (14, 8): 1, (14, 9): 1, (14, 10): 1, (14, 'A'): 1,
            (15, 2): 0, (15, 3): 0, (15, 4): 0, (15, 5): 0, (15, 6): 0, (15, 7): 1, (15, 8): 1, (15, 9): 1, (15, 10): 1, (15, 'A'): 1,
            (16, 2): 0, (16, 3): 0, (16, 4): 0, (16, 5): 0, (16, 6): 0, (16, 7): 1, (16, 8): 1, (16, 9): 1, (16, 10): 1, (16, 'A'): 1,
            (17, 2): 0, (17, 3): 0, (17, 4): 0, (17, 5): 0, (17, 6): 0, (17, 7): 0, (17, 8): 0, (17, 9): 0, (17, 10): 0, (17, 'A'): 1,
            (18, 2): 0, (18, 3): 0, (18, 4): 0, (18, 5): 0, (18, 6): 0, (18, 7): 0, (18, 8): 0, (18, 9): 0, (18, 10): 0, (18, 'A'): 0,
            (19, 2): 0, (19, 3): 0, (19, 4): 0, (19, 5): 0, (19, 6): 0, (19, 7): 0, (19, 8): 0, (19, 9): 0, (19, 10): 0, (19, 'A'): 0,
            (20, 2): 0, (20, 3): 0, (20, 4): 0, (20, 5): 0, (20, 6): 0, (20, 7): 0, (20, 8): 0, (20, 9): 0, (20, 10): 0, (20, 'A'): 0,
            (21, 2): 0, (21, 3): 0, (21, 4): 0, (21, 5): 0, (21, 6): 0, (21, 7): 0, (21, 8): 0, (21, 9): 0, (21, 10): 0, (21, 'A'): 0,
            ('bust', 2): 0, ('bust', 3): 0, ('bust', 4): 0, ('bust', 5): 0, ('bust', 6): 0, ('bust', 7): 0, ('bust', 8): 0, ('bust', 9): 0, ('bust', 10): 0, ('bust', 'A'): 0  
            } 
        
        self.strategy_with_ace = {
            (12, 2): 1, (12, 3): 1, (12, 4): 1, (12, 5): 1, (12, 6): 1,
            (12, 7): 1, (12, 8): 1, (12, 9): 1, (12, 10): 1, (12, 'A'): 1,
            (13, 2): 1, (13, 3): 1, (13, 4): 1, (13, 5): 1, (13, 6): 1,
            (13, 7): 1, (13, 8): 1, (13, 9): 1, (13, 10): 1, (13, 'A'): 1,
            (14, 2): 1, (14, 3): 1, (14, 4): 1, (14, 5): 1, (14, 6): 1,
            (14, 7): 1, (14, 8): 1, (14, 9): 1, (14, 10): 1, (14, 'A'): 1,
            (15, 2): 1, (15, 3): 1, (15, 4): 1, (15, 5): 1, (15, 6): 1,
            (15, 7): 1, (15, 8): 1, (15, 9): 1, (15, 10): 1, (15, 'A'): 1,
            (16, 2): 1, (16, 3): 1, (16, 4): 1, (16, 5): 1, (16, 6): 1,
            (16, 7): 1, (16, 8): 1, (16, 9): 1, (16, 10): 1, (16, 'A'): 1,
            (17, 2): 1, (17, 3): 1, (17, 4): 1, (17, 5): 1, (17, 6): 1,
            (17, 7): 1, (17, 8): 1, (17, 9): 1, (17, 10): 1, (17, 'A'): 1,
            (18, 2): 1, (18, 3): 1, (18, 4): 1, (18, 5): 1, (18, 6): 1,
            (18, 7): 0, (18, 8): 0, (18, 9): 1, (18, 10): 1, (18, 'A'): 1,
            (19, 2): 0, (19, 3): 0, (19, 4): 0, (19, 5): 0, (19, 6): 0,
            (19, 7): 0, (19, 8): 0, (19, 9): 0, (19, 10): 0, (19, 'A'): 0,
            (20, 2): 0, (20, 3): 0, (20, 4): 0, (20, 5): 0, (20, 6): 0,
            (20, 7): 0, (20, 8): 0, (20, 9): 0, (20, 10): 0, (20, 'A'): 0,
            (21, 2): 0, (21, 3): 0, (21, 4): 0, (21, 5): 0, (21, 6): 0,
            (21, 7): 0, (21, 8): 0, (21, 9): 0, (21, 10): 0, (21, 'A'): 0,
            ('bust', 2): 0, ('bust', 3): 0, ('bust', 4): 0, ('bust', 5): 0, ('bust', 6): 0,
            ('bust', 7): 0, ('bust', 8): 0, ('bust', 9): 0, ('bust', 10): 0, ('bust', 'A'): 0,
        }     
        
##############################################################
                                                          
    def give_card(self):
        cards = list(np.arange(2, 10)) + [10]*4 + ['A']
        return np.random.choice(cards)
    
##############################################################
   
    # performing HIT action - to the current state
    def get_next_states_hit(self, state):
           
        possible_cards = list(np.arange(2, 10)) + [10]*4 + ['A']
        valid_states = []
        
        for card in possible_cards:
            if self.ace_flag == True:
                state_value = [s[1] for s in self.hands if s[0] == state][0]
                if card != 'A':
                    v_state = state_value + card
                    if v_state <= 21:
                        valid_states.append((v_state, True))
                    else:      
                        valid_states.append((v_state-10, False))
                else:
                    v_state = state_value + 11
                    if v_state <= 21:
                        valid_states.append((v_state, True))
                    else:
                        new_v_state = v_state - 10
                        if new_v_state <= 21:
                            valid_states.append((new_v_state, True))
                        else:
                            vv = new_v_state - 10
                            if vv <= 21:
                                valid_states.append((vv, False))
                            else:
                                valid_states.append(('bust', False))
        
            else:
                state_value = state
                if card != 'A':
                    v_state = state_value + card
                    if v_state <= 21:
                        valid_states.append((v_state, False))
                    else:
                        valid_states.append(("bust", False))
                else:
                    v_state = state_value + 11
                    if v_state <= 21:
                        valid_states.append((v_state, True))
                    else:
                        vv = v_state - 10
                        if vv <= 21:
                            valid_states.append((vv, False))
                        else:
                            valid_states.append(('bust', False))
                        
                                                      
        return valid_states
    
    # performing STAND action - to the current state
    def get_next_states_stand(self, state):
            
        valid_states = []
        valid_states.append(state)
        
        return valid_states
      
##############################################################
    #get max Q(s', a')
    
    def get_possible_q_values(self, valid_states_stand, valid_states_hit, upcard):
        q_values = []
        if upcard != 'A':
            upcard = int(upcard)

        # s' | a - hit
        for next_state in valid_states_hit:
     
            nstate, ace = next_state[0], next_state[1]
            if ace is True:
                state_value = [s for s in self.hands if s[1] == nstate][0]
                q_value_hit = self.Q_table_with_Ace[(state_value, upcard)][1]
                q_value_stand = self.Q_table_with_Ace[(state_value, upcard)][0]
                # Q(s', a'- hit) | Q(s', a'- stand) | a - hit | an Ace in hand
                q = (q_value_hit, q_value_stand, 1, True)
                q_values.append(q)
                
            else:
                state_value = nstate
                q_value_hit = self.Q_table_no_Ace[(state_value, upcard)][1]
                q_value_stand = self.Q_table_no_Ace[(state_value, upcard)][0]
                
                q = (q_value_hit, q_value_stand, 1, False)
                q_values.append(q)  
         
        # s' | a - stand       
        for next_state in valid_states_stand:
            nstate = next_state
            if type(nstate) == str:
   
                state = [s for s in self.hands if s[0] == nstate][0]
                q_value_hit = self.Q_table_with_Ace[(state, upcard)][1]
                q_value_stand = self.Q_table_with_Ace[(state, upcard)][0]
                
                # Q(s', a'- hit) | Q(s', a'- stand) | a - stand | an Ace in hand
                q = (q_value_hit, q_value_stand, 0, True)
                q_values.append(q)
                 
            else:
                q_value_hit = self.Q_table_no_Ace[(nstate, upcard)][1]
                q_value_stand = self.Q_table_no_Ace[(nstate, upcard)][0]
     
                q = (q_value_hit, q_value_stand, 0, False)
                q_values.append(q)
                                                                                                
        return q_values
              
    def get_max_future_q(self, q_values):
  
        max_q = []
        max_value = float('-inf')
                
        for q in q_values:
            q_value_a_prim_hit = q[0]
            q_value_a_prim_stand = q[1]
               
            if q_value_a_prim_hit > max_value:
                max_value = q_value_a_prim_hit
                max_q = []
                max_q.append(q_value_a_prim_hit)

            if q_value_a_prim_hit == max_value:
                max_q.append(q_value_a_prim_hit)     
            if q_value_a_prim_stand > max_value:
                max_value = q_value_a_prim_stand
                max_q = []
                max_q.append(q_value_a_prim_stand)
            if q_value_a_prim_stand == max_value:
                max_q.append(q_value_a_prim_stand)    
            
        if len(max_q) > 1:
            max_value = random.choice(max_q)

        else:
            max_value = max_value

        return max_value    
    
##############################################################

    def choose_next_action(self, state, upcard, ace, epsilon):
        #print("SSSSSSS", state)
        
        if state == 'bust':
            action = 0
            
        if epsilon > random.random():
            if ace == True:
                state = [s for s in self.hands if s[0] == state][0]
                av_stand = self.Q_table_with_Ace[(state, upcard)][0]
                av_hit = self.Q_table_with_Ace[(state, upcard)][1]
                
                if av_stand > av_hit:
                    action = 0
                elif av_stand < av_hit:
                    action = 1
                else:
                    action = random.choice([0, 1])
            else:
                v_stand = self.Q_table_no_Ace[(state, upcard)][0]
                v_hit = self.Q_table_no_Ace[(state, upcard)][0]
                
                if v_stand > v_hit:
                    action = 0
                elif v_stand < v_hit:
                    action = 1
                else:
                    action = random.choice([0, 1])
        else:
            action = random.choice([0, 1])
                   
        return action

##############################################################

    def get_a_prim(self, state, upcard, ace):
        if state == 'bust':
            a_prim = 0
        elif type(state) == str:
            ss = [s for s in self.hands if s[0] == state][0]
            av_stand = self.Q_table_with_Ace[(ss, upcard)][0]
            av_hit = self.Q_table_with_Ace[(ss, upcard)][1]
            if av_stand > av_hit:
                a_prim = 0
            elif av_stand < av_hit:
                a_prim = 1
            else:
                a_prim = random.choice([0, 1])
                
        else:
            sv_stand = self.Q_table_no_Ace[(state, upcard)][0]
            sv_hit = self.Q_table_no_Ace[(state, upcard)][1]

            if sv_stand > sv_hit:   
                a_prim = 0
            elif sv_stand < sv_hit:
                a_prim = 1
            else:
                a_prim = random.choice([0, 1])
            
        return a_prim
   
##############################################################
   
    def dealer_policy(self, upcard, second_card):
        card1, card2 = upcard, second_card
        
        player_value_no_Ace = 0
        player_value_with_Ace = ''
        
        if card1 == 'A' or card2 == 'A':
            ace_flag = True
            if card1 == 'A':
                player_value_with_Ace = 'A' + str(card2)
            if card2 == 'A':
                player_value_with_Ace = 'A' + str(card1)
                
        else:
            ace_flag = False
            player_value_no_Ace += int(card1)
            player_value_no_Ace += int(card2)
        
        if ace_flag is True:
            state_value = [s[1] for s in self.hands if s[0] == player_value_with_Ace][0]
        else:
            state_value = player_value_no_Ace
            
        end_flag = 0
        while end_flag == 0:
            if state_value > 21:
                if ace_flag is True:
                    state_value -= 10
                    ace_flag = False

                end_flag += 1
                
            if state_value >= 17:

                end_flag += 1
                
            else:
                card = str(self.give_card())
                if card == 'A':
                    if ace_flag is True:
                        state_value += 1
                    else:
                        state_value += 11
                        ace_flag = True
                else:
                    card = int(card)
                    state_value += card

        return state_value


##############################################################
    def game_with_strategy(self, rounds):
        self.won = 0
        self.draw = 0
        self.lost = 0
        self.bust = 0
        self.epsilon = 1
        
        for i in range(0, rounds):
            #print("round:", i)

            self.ace_flag = False      
            reward = 0
              
            #states
            player_value_no_Ace = 0
            player_value_with_Ace = ''
            
            #give 2 cards to player
            card1 = self.give_card()
            card2 = self.give_card()
            
            if card1 == 'A' or card2 == 'A':
                self.ace_flag = True
                if card1 == 'A':
                    player_value_with_Ace = 'A' + str(card2)
                if card2 == 'A':
                    player_value_with_Ace = 'A' + str(card1)
                
            else:
                player_value_no_Ace += int(card1)
                player_value_no_Ace += int(card2)
                
                
            #give 2 cards to dealer - 1 is visible, 2nd is hidden
            upcard = str(self.give_card())
            dealer_h_card = str(self.give_card())
            
            end = 0
            while end == 0:
                
                if player_value_no_Ace == 'bust':
                    state = 'bust'    
                elif self.ace_flag is True:
                    
                   state = player_value_with_Ace
                   state_value = [s[1] for s in self.hands if s[0] == state][0]
   
                else:
                   state = player_value_no_Ace

              
                if state != 'bust':      

                    if upcard != 'A':
                        upcard = int(upcard)
                    
                    epsilon = 1
                    ace_ = self.ace_flag
               
                    a = self.choose_next_action(state, upcard, ace_, epsilon)
                    
                    if upcard != 'A':
                        upcard = int(upcard)
                
                    if self.ace_flag is True:
                        state_value = [s[1] for s in self.hands if s[0] == state][0]
                    else:
                        state_value = state
                else:
                    a = 0
                    
                
                if a == 0:
                    
                    dlr = self.dealer_policy(upcard, dealer_h_card)
                    if state_value == 'bust' or state_value > 21:
                        self.bust += 1
            
                    elif dlr < state_value or dlr > 21:
                        self.won += 1
                        
                    elif dlr == state_value:
                        self.draw += 1
                        
                    else:
                        self.lost += 1
                                               
                    end += 1
                    
                else:
                                          
                    new_card = str(self.give_card())
                                      
                    
                    #saving ace flag value - so we can know whether current state had the possibilty of -10 to its value
                    ace_flag_2 = self.ace_flag

                    if new_card != 'A':
                        new_card = int(new_card)
                        
                        state_value += new_card
                        if state_value > 21:
                            if self.ace_flag is True:
                                state_value -= 10
                                new_state = state_value
                                self.ace_flag = False
                            else:
                                new_state = 'bust'
                        else:                               
                            if self.ace_flag is True:
                                new_state = [s[0] for s in self.hands if s[1] == state_value][0]
                            else:
                                new_state = state_value
                    else:
                        if self.ace_flag is True:
                            state_value = [s[1] for s in self.hands if s[0] == state][0]
                            state_value += 11
                            if state_value > 21:
                                state_value -= 10
            
                                if state_value <= 21:
                                    new_state = [s[0] for s in self.hands if s[1] == state_value][0]
                                else:
                                    new_state = 'bust'
                        else:
                            state_value += 11
                            if state_value > 21:
                                n_state_value = state_value - 10
                                #new_state = state_value
                                if n_state_value > 21:
                                    new_state = 'bust'
                                else:
                                    new_state = n_state_value
                                                           
                            else:
                                new_state = [s[0] for s in self.hands if s[1] == state_value][0]
                                
                    if type(new_state) != str:
                        self.ace_flag = False
                    else:
                        self.ace_flag = True            
                        
                    if new_state == 'bust':
                        player_value_no_Ace = new_state
                        player_value_with_Ace = new_state
                    elif type(new_state) == str:
                        self.ace_flag = True
                        player_value_with_Ace = new_state
                    else:
                        self.ace_flag = False
                        player_value_no_Ace = new_state
                        
                                    
        print(f"\n WON: {self.won}, LOST: {self.lost}, BUSTED: {self.bust}, DRAW: {self.draw} \n") 
        return

#############################################################

    def game_with_strategy_comments_on(self, rounds):
        self.epsilon = 1
        
        for i in range(0, rounds):
            print("round:", i)

            self.ace_flag = False      
            reward = 0
              
            #states
            player_value_no_Ace = 0
            player_value_with_Ace = ''
            
            #give 2 cards to player
            card1 = self.give_card()
            card2 = self.give_card()
            
            print("user cards:", card1, " - ", card2)
             
            if card1 == 'A' or card2 == 'A':
                self.ace_flag = True
                if card1 == 'A':
                    player_value_with_Ace = 'A' + str(card2)
                if card2 == 'A':
                    player_value_with_Ace = 'A' + str(card1)
                
            else:
                player_value_no_Ace += int(card1)
                player_value_no_Ace += int(card2)
                
                
            #give 2 cards to dealer - 1 is visible, 2nd is hidden
            upcard = str(self.give_card())
            dealer_h_card = str(self.give_card())
            
            
            print("upcard:", upcard)
            
            end = 0
            while end == 0:
                print("-----------------------------------------------")
                
                if player_value_no_Ace == 'bust':
                    state = 'bust'    
                elif self.ace_flag is True:
                    
                    state = player_value_with_Ace
                    state_value = [s[1] for s in self.hands if s[0] == state][0]
                    print("STATE:", state)
                    print("STATE VALUE:", state_value)
   
                else:
                    state = player_value_no_Ace
                    print("STATE:", state)
              
                if state != 'bust':      

                    if upcard != 'A':
                        upcard = int(upcard)
                    
                    epsilon = 1
                    ace_ = self.ace_flag
               
                    a = self.choose_next_action(state, upcard, ace_, epsilon)
                    
                    
                    print("state:", state, ' - flag:', self.ace_flag)
                    
                    if upcard != 'A':
                        upcard = int(upcard)
                
                    if self.ace_flag is True:
                        state_value = [s[1] for s in self.hands if s[0] == state][0]
                    else:
                        state_value = state
                else:
                    a = 0
                    
                
                  
                print("------------ STATE----------", state)
                print("Action taken: ", a, '\n')
                
                if a == 0:
                    
                    dlr = self.dealer_policy(upcard, dealer_h_card)
                    if state_value == 'bust' or state_value > 21:
                        print("YOU BUSTED")
            
                    elif dlr < state_value or dlr > 21:
                        if dlr > 21:
                            dlr = 'bust'
                        print("YOU WON,", dlr, ' < ', state_value)
                        
                    elif dlr == state_value:
                        print("DRAW", dlr, ' - ', state_value)
                        
                    else:
                        print('YOU LOST', dlr, ' > ', state_value)
                     
                     
                    print("____________ STATE_________", state)
                                               
                    end += 1
                    
                else:
                                          
                    new_card = str(self.give_card())
                                          
                    print("current State", state)
                    print("current State Value:", state_value)
                    print("current State AceFlag:", self.ace_flag)
                    print("NEW CARD:", new_card)
                    print("_____     _____")
                     
                    #saving ace flag value - so we can know whether current state had the possibilty of -10 to its value
                    ace_flag_2 = self.ace_flag

                    if new_card != 'A':
                        new_card = int(new_card)
                        
                        state_value += new_card
                        if state_value > 21:
                            if self.ace_flag is True:
                                state_value -= 10
                                new_state = state_value
                                self.ace_flag = False
                            else:
                                new_state = 'bust'
                        else:                               
                            if self.ace_flag is True:
                                new_state = [s[0] for s in self.hands if s[1] == state_value][0]
                            else:
                                new_state = state_value
                    else:
                        if self.ace_flag is True:
                            state_value = [s[1] for s in self.hands if s[0] == state][0]
                            state_value += 11
                            if state_value > 21:
                                state_value -= 10
            
                                if state_value <= 21:
                                    new_state = [s[0] for s in self.hands if s[1] == state_value][0]
                                else:
                                    new_state = 'bust'
                        else:
                            state_value += 11
                            if state_value > 21:
                                n_state_value = state_value - 10
                                #new_state = state_value
                                if n_state_value > 21:
                                    new_state = 'bust'
                                else:
                                    new_state = n_state_value
                                                           
                            else:
                                new_state = [s[0] for s in self.hands if s[1] == state_value][0]
                      
                    print("NEW STATE:", new_state)
                                
                    if type(new_state) != str:
                        self.ace_flag = False
                    else:
                        self.ace_flag = True            
                        
                    if new_state == 'bust':
                        player_value_no_Ace = new_state
                        player_value_with_Ace = new_state
                    elif type(new_state) == str:
                        self.ace_flag = True
                        player_value_with_Ace = new_state
                    else:
                        self.ace_flag = False
                        player_value_no_Ace = new_state
                        
        return

##############################################################
       
    def game_on(self, rounds=10000, rewards=[0, 0, 0, 0, 0, 0]):
        
        rewards_bust, reward_win, reward_draw, reward_lost, reward_strategy, reward_off_strategy = rewards                                                                      
        
        for i in range(0, rounds):
                                  
            #states
            player_value_no_Ace = 0
            player_value_with_Ace = ''
            
            #give 2 cards to player
            card1 = self.give_card()
            card2 = self.give_card()
            #print("user cards:", card1, " - ", card2)
            
            if card1 == 'A' or card2 == 'A':
                self.ace_flag = True
                if card1 == 'A':
                    player_value_with_Ace = 'A' + str(card2)
                if card2 == 'A':
                    player_value_with_Ace = 'A' + str(card1)
                
            else:
                self.ace_flag = False
                player_value_no_Ace += int(card1)
                player_value_no_Ace += int(card2)
                
                
            #give 2 cards to dealer - 1 is visible, 2nd is hidden
            upcard = str(self.give_card())
            dealer_h_card = str(self.give_card())
            
            #print("upcard:", upcard)
            
            end = 0
            while end == 0:
                       
                #CHOSE AN ACTION STAGE - A
                if player_value_no_Ace == 'bust':
                    state = 'bust'    
                elif self.ace_flag is True:
                    
                   state = player_value_with_Ace
                   state_value = [s[1] for s in self.hands if s[0] == state][0]

                   possible_states_hit = self.get_next_states_hit(state)
                   possible_states_stand = self.get_next_states_stand(state)
                else:
                   state = player_value_no_Ace
                   possible_states_hit = self.get_next_states_hit(state)
                   possible_states_stand = self.get_next_states_stand(state)

                #print("------------ STATE----------", state) 
                
                if state != 'bust':     
                     
                    possible_q_values = self.get_possible_q_values(possible_states_stand, possible_states_hit, upcard)
                                 
                    if upcard != 'A':
                        upcard = int(upcard)
                        
                    
                    max_q_value = self.get_max_future_q(possible_q_values)
                    
                    #which action a to choose for current state s
                    ace_ = self.ace_flag
                    epsilon = 0.8 
                    a = self.choose_next_action(state, upcard, ace_, epsilon)
                           
                    if self.ace_flag is True:
                        state_value = [s[1] for s in self.hands if s[0] == state][0]
                        strategy = self.strategy_with_ace[(state_value, upcard)]
                        state_value2 = [s for s in self.hands if s[0] == state][0]
                    else:
                        state_value = state
                        strategy = self.strategy_without_ace[(state_value, upcard)]
                        state_value2 = state
                else:
                    a = 0
                    
                # Rewards stage
      
                if a == 0:
                    
                    dlr = self.dealer_policy(upcard, dealer_h_card)
                    if state_value == 'bust' or state_value > 21:                  
                        reward = rewards_bust
                        b = 1.237
                        #print("YOU BUSTED")     
                    elif dlr < state_value or dlr > 21:
                        #print("YOU WON,", dlr, ' < ', state_value)
                        reward = reward_win
                        b = 1
                    elif dlr == state_value:
                        #print("DRAW", dlr, ' - ', state_value)
                        reward = reward_draw
                        b = 1
                    else:
                        #print('YOU LOST', dlr, ' > ', state_value)
                        reward = reward_lost
                        b = 1.237
                    

                    if self.ace_flag is True:
                        if state != 'bust':
                            st = [s for s in self.hands if s[0] == state][0]
                        else:
                            st = 'bust'
                        
                        self.Q_table_with_Ace[(st, upcard)][0] += b*abs(self.alfa*(reward - self.Q_table_with_Ace[(st, upcard)][0]))
                    else:
                        self.Q_table_no_Ace[(state, upcard)][0] += b*abs(self.alfa*(reward - self.Q_table_no_Ace[(state, upcard)][0]))
                           
                    end += 1
                    
                else:
                    
                    if strategy == a:
                        reward = reward_strategy
                    else:
                        reward = reward_off_strategy

                    new_card = str(self.give_card())
                    
                    #saving ace flag value - so we can know whether current state had the possibilty of -10 to its value
                    ace_flag_2 = self.ace_flag
                    
                    #UPDATING Q TABLE
                    #print("REWARD:", reward)
                    if ace_flag_2 is True:

                        self.Q_table_with_Ace[(state_value2, upcard)][a] += abs(self.alfa*(reward + self.gamma*( max_q_value - self.Q_table_with_Ace[(state_value2, upcard)][a])))                 
                    else:
                        self.Q_table_no_Ace[(state_value2, upcard)][a] += abs(self.alfa*(reward + self.gamma*( max_q_value - self.Q_table_no_Ace[(state_value2, upcard)][a]))) 
                        
                        
                    # GOING TO NEXT STATE - S'
                    if new_card != 'A':
                        new_card = int(new_card)
                        
                        state_value += new_card
                        if state_value > 21:
                            if self.ace_flag is True:
                                state_value -= 10
                                new_state = state_value
                                self.ace_flag = False
                            else:
                                new_state = 'bust'
                        else:                               
                            if self.ace_flag is True:
                                new_state = [s[0] for s in self.hands if s[1] == state_value][0]
                            else:
                                new_state = state_value
                    else:
                        if self.ace_flag is True:
                            state_value = [s[1] for s in self.hands if s[0] == state][0]
                            state_value += 11
                            if state_value > 21:
                                state_value -= 10
                      
                                if state_value <= 21:
                                    new_state = [s[0] for s in self.hands if s[1] == state_value][0]
                                else:
                                    new_state = 'bust'
                        else:
                            state_value += 11
                            if state_value > 21:
                                n_state_value = state_value - 10
                                
                                if n_state_value > 21:
                                    new_state = 'bust'
                                else:
                                    new_state = n_state_value
                                                           
                            else:
                                new_state = [s[0] for s in self.hands if s[1] == state_value][0]
                                
                    #print("NEW STATE:", new_state, type(new_state)) 
                    
                    if type(new_state) != str:
                        self.ace_flag = False
                    else:
                        self.ace_flag = True
            
                    if new_state == 'bust':
                        player_value_no_Ace = new_state
                        player_value_with_Ace = new_state
                    elif type(new_state) == str:
                        self.ace_flag = True
                        player_value_with_Ace = new_state
                    else:
                        self.ace_flag = False
                        player_value_no_Ace = new_state
                                       
                               

Example results

In [62]:
# Reward values, packed below in the order game_on unpacks them:
# bust, win, draw, lost, on-strategy hit, off-strategy hit.
rewards_bust, reward_win, reward_draw = -200, 80, 8
reward_lost, reward_strategy, reward_off_strategy = -200, 5, -5
rewards = [
    rewards_bust,
    reward_win,
    reward_draw,
    reward_lost,
    reward_strategy,
    reward_off_strategy,
]

# Train a fresh agent.
rounds = 300000
bj = BlackJack()
bj.game_on(rounds=rounds, rewards=rewards)

print("DONE")

# Dump every learned entry: first the table for hands without an ace ...
print("Q no ace:")
for q, q_values in bj.Q_table_no_Ace.items():
    print(q, ' - ', q_values, '\n')

print(" - \n\n - ")

# ... then the table for hands holding an ace.
print("Q with ace:")
for q, q_values in bj.Q_table_with_Ace.items():
    print(q, ' - ', q_values, '\n')

DONE
Q no ace:
(4, 2)  -  {1: 9852176760443046.0, 0: 351.4536945628529} 

(4, 3)  -  {1: 1924058774803188.2, 0: 217.82216554861864} 

(4, 4)  -  {1: 124656140454523.52, 0: 336.612760051022} 

(4, 5)  -  {1: 1337589768338484.8, 0: 300.95308259968766} 

(4, 6)  -  {1: 946687708889737.0, 0: 529.5750276273725} 

(4, 7)  -  {1: 616916964386094.2, 0: 590.2634364334716} 

(4, 8)  -  {1: 208802130162741.3, 0: 567.7843155059982} 

(4, 9)  -  {1: 704452755477739.4, 0: 877.8581340755028} 

(4, 10)  -  {1: 5.558051787088396e+61, 0: 228754.90032130913} 

(4, 'A')  -  {1: 1.0733531951008211e+17, 0: 817.5243642641258} 

(5, 2)  -  {1: 1.0317929838188914e+16, 0: 3227.0806805601724} 

(5, 3)  -  {1: 5925250568933110.0, 0: 1949.3462179022608} 

(5, 4)  -  {1: 302871814290122.3, 0: 3705.399502409078} 

(5, 5)  -  {1: 578446683120983.1, 0: 3439.202604639055} 

(5, 6)  -  {1: 843153618352351.2, 0: 2789.7751634617084} 

(5, 7)  -  {1: 563919777910733.9, 0: 3107.4267984142857} 

(5, 8)  -  {1: 66310362382216

In [74]:
# Play 1000 rounds with the agent trained above; the call prints the
# WON / LOST / BUSTED / DRAW tally shown in the output below.
bj.game_with_strategy(rounds=1000)


 WON: 255, LOST: 347, BUSTED: 358, DRAW: 40 



Looking for a decent set of reward values

In [29]:
# Candidate values for each of the six reward components.
rewards_win = [50, 100, 200, 300, 500, 1000]
rewards_bust = [-100, -150, -200, -400, -800, -1000]
rewards_lost = [-80, -140, -400, -900 ]
rewards_draw = [20, 40]
rewards_strategy = [1, 5, 10]
rewards_off = [-1, -2.5, -5]

# Every combination of the candidates above. Each entry is laid out as
# [bust, win, draw, lost, on-strategy, off-strategy]; the `for` clauses run
# outermost-first, so the win value varies slowest and the off-strategy
# value fastest.
rewards_ops = [
    [bust_value, win_value, draw_value, lost_value, on_value, off_value]
    for win_value in rewards_win
    for bust_value in rewards_bust
    for lost_value in rewards_lost
    for draw_value in rewards_draw
    for on_value in rewards_strategy
    for off_value in rewards_off
]

print(len(rewards_ops))

2592


In [75]:
# Search settings.
x = 50            # how many reward sets to draw from rewards_ops
rounds = 350000   # training rounds per candidate agent
y = 1000          # evaluation rounds per candidate agent
bjs = []          # agents whose evaluation reached the win threshold

rewards = random.sample(rewards_ops, x)
print(rewards)

# Train one fresh agent per sampled reward set and keep the promising ones.
for r in rewards:
    bj = BlackJack()
    bj.game_on(rounds=rounds, rewards=r)
    print("REWARDS:", r)
    bj.game_with_strategy(y)

    promising = bj.won >= 250
    if promising:
        print("!!!!!!!!!!!")
        bjs.append(bj)

    print("________________\n")

print("DONE")

[[-150, 300, 40, -900, 1, -5], [-1000, 100, 40, -140, 5, -2.5], [-100, 500, 40, -400, 1, -1], [-200, 100, 40, -80, 1, -2.5], [-200, 200, 20, -80, 5, -5], [-100, 500, 40, -80, 1, -1], [-150, 300, 20, -140, 5, -2.5], [-200, 1000, 20, -80, 5, -1], [-200, 300, 20, -140, 5, -2.5], [-1000, 50, 20, -140, 10, -5], [-100, 50, 40, -140, 1, -1], [-800, 300, 20, -900, 5, -1], [-200, 500, 20, -900, 1, -5], [-150, 300, 20, -900, 10, -5], [-1000, 200, 20, -140, 5, -5], [-400, 50, 20, -80, 5, -2.5], [-200, 300, 20, -900, 10, -1], [-100, 100, 20, -900, 10, -5], [-100, 100, 20, -140, 10, -5], [-800, 100, 20, -900, 5, -2.5], [-200, 1000, 40, -900, 5, -2.5], [-150, 300, 40, -400, 1, -2.5], [-1000, 1000, 20, -80, 5, -2.5], [-200, 50, 20, -80, 10, -5], [-800, 1000, 40, -900, 5, -5], [-150, 200, 40, -900, 5, -2.5], [-150, 100, 40, -80, 1, -1], [-150, 100, 20, -900, 10, -5], [-1000, 1000, 40, -140, 5, -2.5], [-400, 1000, 20, -400, 10, -1], [-150, 50, 20, -140, 1, -5], [-200, 1000, 40, -80, 5, -1], [-400, 300,

In [76]:
# Prefer one of the agents kept by the search; if none qualified,
# train a new agent on a fallback reward set.
if bjs:
    bj = random.choice(bjs)
else:
    rewards = [-800, 50, 20, -900, 10, -2.5] #random set
    bj = BlackJack()
    bj.game_on(rounds=rounds, rewards=rewards)
    

In [85]:
# Repeat the 1000-round evaluation several times and average the win counts.
rnd = 10
wins_per_run = []
for run in range(rnd):
    bj.game_with_strategy(rounds = 1000)
    wins_per_run.append(bj.won)

avg = sum(wins_per_run)

print("Average games won per 1000 games:", avg / rnd)
    


 WON: 254, LOST: 342, BUSTED: 362, DRAW: 42 


 WON: 260, LOST: 368, BUSTED: 336, DRAW: 36 


 WON: 248, LOST: 372, BUSTED: 346, DRAW: 34 


 WON: 250, LOST: 375, BUSTED: 342, DRAW: 33 


 WON: 247, LOST: 329, BUSTED: 367, DRAW: 57 


 WON: 282, LOST: 350, BUSTED: 333, DRAW: 35 


 WON: 252, LOST: 373, BUSTED: 326, DRAW: 49 


 WON: 270, LOST: 342, BUSTED: 340, DRAW: 48 


 WON: 242, LOST: 347, BUSTED: 367, DRAW: 44 


 WON: 242, LOST: 370, BUSTED: 349, DRAW: 39 

Average games won per 1000 games: 254.7


In [78]:
# Play 10 rounds with verbose tracing: the output below lists, per round,
# the dealt cards, each state, the action taken and any newly drawn card.
bj.game_with_strategy_comments_on(rounds=10)

round: 0
user cards: 10  -  8
upcard: 4
-----------------------------------------------
STATE: 18
state: 18  - flag: False
------------ STATE---------- 18
Action taken:  0 

YOU WON, bust  <  18
____________ STATE_________ 18
round: 1
user cards: 3  -  9
upcard: 7
-----------------------------------------------
STATE: 12
state: 12  - flag: False
------------ STATE---------- 12
Action taken:  1 

current State 12
current State Value: 12
current State AceFlag: False
NEW CARD: 9
_____     _____
NEW STATE: 21
-----------------------------------------------
STATE: 21
state: 21  - flag: False
------------ STATE---------- 21
Action taken:  1 

current State 21
current State Value: 21
current State AceFlag: False
NEW CARD: 4
_____     _____
NEW STATE: bust
-----------------------------------------------
------------ STATE---------- bust
Action taken:  0 

YOU BUSTED
____________ STATE_________ bust
round: 2
user cards: A  -  10
upcard: 10
-----------------------------------------------
STATE: 