In [71]:
import numpy as np

from collections import defaultdict
import plotting

In [72]:


# Create Blackjack environment - Model free environment
deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]

class BlackjackEnv(object):
    """Create an environment of a Black Jack Game
    dealer = list - dealer list card in his deck
    player = list - player list card in his deck
    done = bool - True if the game is done, and False otherwise
    this class only can be accessed from act(), reset(), state()
    act() = PARAMS : 1 if hit, 0 if stick
            RETURN : (state), done_status, reward
    reset() = PARAMS : None
              RETURN : None
    state() = PARAMS : None
             RETURN : (state)
    (state) is a tuple of player score, dealers score, usable ace condition
    Black Jack Refferences:
    [1] https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node51.html (Example 5.1)
    [2] http://www.bicyclecards.com/how-to-play/blackjack/
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.dealer = [self.draw()]
        self.player = [self.draw()]
        self.done = False

    def natural(self,hand): # check if he got natural/blackjack condition
        return sorted(hand)==[1,10]

    def draw(self): # get one card
        return np.random.choice(deck)

    def usable(self, hand): # check if he got usable ace condition
        return 1 in hand and sum(hand) + 10 <= 21

    def busted(self, hand): # check if he got busted
        return self.sum_hand(hand) > 21

    def sum_hand(self, hand):
        if self.usable(hand):
            return sum(hand) + 10
        else:
            return sum(hand)

    def state(self):
        return self.sum_hand(self.player), self.sum_hand(self.dealer), \
                    self.usable(self.player)

    def act(self, hit):
        if not self.done:
            if hit:
                self.hit()
                if self.busted(self.player):
                    self.done = True
                    return self.state(), self.done, -1
                else:
                    return self.state(), self.done, 0
            else:
                return self.stick()

    def hit(self):
        self.player.append(self.draw())

    def stick(self):
        self.done = True

        # Dealer doing hit while his score below 17
        # see refference [2]
        while self.sum_hand(self.dealer) < 17:
            self.dealer.append(self.draw())

        # player'll never get busted in here so just sum it
        player_score = self.sum_hand(self.player)

        dealer_score = -1 if self.busted(self.dealer) else self.sum_hand(self.dealer)

        if self.natural(self.player) and self.natural(self.dealer):
            reward = 1
        elif self.natural(self.player):
            reward = 1.5
        elif dealer_score > player_score:
            reward = -1
        elif dealer_score < player_score:
            reward = 1
        else:
            reward = 0

        return self.state(), self.done, reward

def print_state(recvdState):
    player_sum, dealer_sum, usable_ace = recvdState
    print("Player sum : {}, Dealer sum: {}, Usable Ace : {}".format(player_sum, dealer_sum, usable_ace))
    
def take_action(recvdState):
    player_sum, dealer_sum, usable_ace = recvdState
    if int(player_sum) >= 20: 
        return 0  
    else: 
        return 1 # 1 = hit, 0 = stick
        

In [73]:
env = BlackjackEnv()
env.reset()
init_state = env.state()

In [74]:
# State here represents the tuple returned by the environment when an agent reaches there
# state returns - score of the player, score of the dealer, usable ace or not 
# Usable ace - an ace where its value can be considered to be 11
# print(init_state)

In [75]:
# Lets play a test game 
for episodes in range(20):
    print("New Game Starting!")
    env.reset()
    new_state = env.state()
    for one_step in range(100):
        print_state(new_state)
        action = take_action(new_state)
        print("Taking action: {}".format( ["Stick", "Hit"][action]))
        new_state_vals = env.act(action)
        new_state = new_state_vals[0]
        #print(new_state_vals)
        
        if new_state_vals[1] == True:
            print_state(new_state)
            print("End of game, reward is {}".format(new_state_vals[2]))
            break
        

New Game Starting!
Player sum : 10, Dealer sum: 7, Usable Ace : False
Taking action: Hit
Player sum : 20, Dealer sum: 7, Usable Ace : False
Taking action: Stick
Player sum : 20, Dealer sum: 17, Usable Ace : False
End of game, reward is 1
New Game Starting!
Player sum : 8, Dealer sum: 8, Usable Ace : False
Taking action: Hit
Player sum : 12, Dealer sum: 8, Usable Ace : False
Taking action: Hit
Player sum : 22, Dealer sum: 8, Usable Ace : False
End of game, reward is -1
New Game Starting!
Player sum : 8, Dealer sum: 2, Usable Ace : False
Taking action: Hit
Player sum : 10, Dealer sum: 2, Usable Ace : False
Taking action: Hit
Player sum : 18, Dealer sum: 2, Usable Ace : False
Taking action: Hit
Player sum : 24, Dealer sum: 2, Usable Ace : False
End of game, reward is -1
New Game Starting!
Player sum : 10, Dealer sum: 4, Usable Ace : False
Taking action: Hit
Player sum : 20, Dealer sum: 4, Usable Ace : False
Taking action: Stick
Player sum : 20, Dealer sum: 22, Usable Ace : False
End of ga

In [107]:
def mc_prediction(policy, env, num_episodes, discount_factor=1.0):
    V = defaultdict(float)
    states = []
    rewards = []
    state_count = []
    episode_state_list = []
    sum_rewards = defaultdict(float)
    count_states = defaultdict(float)
    G = 0

    for i in range(num_episodes):
        if (i%1000 == 0):
            print("iteration number : ", i)
        # Start of a new episode
        #print("Start of a new episode")
        env.reset()
        new_state = env.state()
        states.append(new_state[0])

        #'''
        for st in range(100): #Every episode has 100 steps
            #print_state(new_state)
            action = policy(new_state)
            #print("Taking action: {}".format( ["Stick", "Hit"][action]))
            new_state_vals = env.act(action)
            new_state = new_state_vals[0]
            #print_state(new_state)
            states.append(new_state[0])
            episode_state_list.append([new_state_vals[0], action, new_state_vals[2]])
            #print(new_state_vals)

            if new_state_vals[1] == True:
                #rewards.append(new_state_vals[2])
                #print_state(new_state)
                #print("End of game, reward is {}".format(new_state_vals[2]))
                break

            # Find all the unique states for this episode (this helps to find the first time a state appears in an episode)
            #print("states : ", states)
            unique_states = set(states)
            #print("unique_states : ",unique_states)
        #'''
        #print("episode_state_list", episode_state_list)
        for state in unique_states:
            for i, episode in enumerate(episode_state_list):
                 if(episode[0][0] == state):
                     #print("Found the first occurence of state : ", state, i, episode)
                     #print("Printing all episodes from the i'th episode",episode_state_list[i:])
                     for j in episode_state_list[i:]:
                         #print(j[2])
                         x = j[2]
                         G += x #* (1**i))  # 1 is the discount factor
            sum_rewards[state] += G
            count_states[state] += 1
            V[state] = sum_rewards[state] / count_states[state]
    return(V)


In [108]:
def sample_policy(recvdState):
    player_sum, dealer_sum, usable_ace = recvdState
    if int(player_sum) >= 20: 
        return 0  
    else: 
        return 1 # 1 = hit, 0 = stick


In [110]:
env = BlackjackEnv()
V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
print(V_10k)
#plotting.plot_value_function(V_10k, title="10,000 Steps")

#V_500k = mc_prediction(sample_policy, env, num_episodes=500)
#plotting.plot_value_function(V_500k, title="500,000 Steps")

KeyboardInterrupt: 