In [None]:
from random import randrange

class BlackJack:
    """Minimal gym-style Blackjack environment.

    Cards are drawn with replacement from the values of one 52-card deck
    (ace = 1, face cards = 10). The observation returned by every
    transition method is ``(dealer_cards[0], my_sum(), has_usable_as())``
    and the full result is ``(observation, reward, done, info)`` where
    ``info`` holds both hands.

    Actions: ``0`` = hit, ``1`` = stick.
    Rewards: ``1`` win, ``0`` draw (or game still running), ``-1`` loss.
    """

    # Card values of one deck: 1-9 once per suit plus four ten-valued cards.
    cards = 4 * (list(range(1, 10)) + [10, 10, 10, 10])

    class ActionSpace:
        """Discrete action space exposing ``n`` and ``sample()``."""

        def __init__(self, n):
            self.n = n

        def sample(self):
            """Return a uniformly random action index in ``[0, n)``."""
            return randrange(self.n)

    def __init__(self):
        self.reward = 0
        self.done = False
        self.action_space = BlackJack.ActionSpace(2)
        # When True every method logs, whatever its ``log`` argument says.
        self.force_log = False

    def close(self):
        """No resources to release; present for environment-API parity."""
        pass

    def render(self):
        """Switch on logging for all subsequent calls."""
        self.force_log = True

    @classmethod
    def draw_card(cls):
        """Draw one card value uniformly from ``cards`` (with replacement).

        P(ace) = 4/52, P(ten-valued) = 16/52, every other value 4/52.
        """
        return cls.cards[randrange(len(cls.cards))]

    def start_game(self, log = False):
        """Deal a new hand; same result tuple as :meth:`distribute`."""
        return self.distribute(log)

    def reset(self):
        """Deal a new hand and return only the observation."""
        state, _, _, _ = self.distribute()
        return state

    def _result(self):
        """Build the ``(observation, reward, done, info)`` tuple."""
        observation = (self.dealer_cards[0], self.my_sum(), self.has_usable_as())
        info = {'Dealer': self.dealer_cards, 'Player': self.player_cards}
        return observation, self.reward, self.done, info

    def distribute(self, log = False):
        """Deal two cards to dealer and player and settle naturals.

        A player natural (21 on the deal) ends the game at once: reward 0
        if the dealer also holds 21, otherwise reward 1.
        """
        log = log or self.force_log
        self.done = False
        self.reward = 0
        self.dealer_cards = [BlackJack.draw_card(), BlackJack.draw_card()]
        self.player_cards = [BlackJack.draw_card(), BlackJack.draw_card()]
        if log:
            print("Dealer:", self.dealer_cards[0])
            print("My cards:", self.player_cards)
        if self.my_sum() == 21:
            self.reward = 0 if self.dealer_sum() == 21 else 1
            self.done = True
        return self._result()

    def has_usable_as(self):
        """Return 1 if the player holds an ace currently counted as 11, else 0."""
        hand = self.player_cards
        # An ace is usable only while promoting it to 11 does not bust the hand.
        return int(1 in hand and sum(hand) + 10 <= 21)

    def step(self, action, log = False):
        """Apply ``action`` (0 = hit, 1 = stick) and return the result tuple.

        Raises:
            IndexError: if ``action`` is neither 0 nor 1.
        """
        log = log or self.force_log
        if action == 0:
            self.hit(log)
        elif action == 1:
            self.stick(log)
        else:
            raise IndexError("action must be 0 (hit) or 1 (stick)")
        return self._result()

    def hit(self, log = False):
        """Draw one card for the player.

        Busting (> 21) ends the game with reward -1; reaching exactly 21
        sticks automatically so the dealer plays out.
        """
        log = log or self.force_log
        if self.done:
            print("Game already over")
            return self._result()
        self.player_cards.append(BlackJack.draw_card())
        if log:
            print("Hit !")
            print("My cards:", self.player_cards)
        total = self.my_sum()
        if total > 21:
            self.reward = -1
            self.done = True
        elif total == 21:
            # Forward ``log`` so the dealer's play is logged like the hit was.
            return self.stick(log)
        return self._result()

    def stick(self, log = False):
        """Stop drawing, let the dealer play, and settle the game."""
        log = log or self.force_log
        if self.done:
            print("Game already over")
            return self._result()
        # Dealer policy: draw until reaching at least 17.
        while self.dealer_sum() < 17:
            self.dealer_cards.append(BlackJack.draw_card())
        my_sum = self.my_sum()
        dealer_sum = self.dealer_sum()
        if log:
            print("Stick !")
            print("Dealer score:", dealer_sum)
            print("My sum:", my_sum)
        if dealer_sum > 21 or my_sum > dealer_sum:
            self.reward = 1
        elif my_sum == dealer_sum:
            self.reward = 0
        else:
            self.reward = -1
        self.done = True
        return self._result()

    @classmethod
    def compute_sum(cls, cards):
        """Return the best total of ``cards``, counting one ace as 11 when safe.

        At most one ace can ever be promoted (two would already make 22),
        and it is promoted only when the resulting total stays <= 21.
        """
        value = sum(cards)
        if 1 in cards and value + 10 <= 21:
            value += 10
        return value

    def my_current_sum(self):
        """Return the player's hard total (every ace counted as 1)."""
        return sum(self.player_cards)

    def my_sum(self):
        """Return the player's best total."""
        return BlackJack.compute_sum(self.player_cards)

    def dealer_sum(self):
        """Return the dealer's best total."""
        return BlackJack.compute_sum(self.dealer_cards)

    def interactive_play(self, action):
        """Play ``action`` with logging, or print the final hands if the game is over.

        Returns the usual result tuple but with an empty ``info`` dict.
        """
        if not self.done:
            self.step(action, True)
        else:
            print("Game over", self.reward)
            print("Dealer:", self.dealer_cards)
            print("My cards:", self.player_cards)
        observation = (self.dealer_cards[0], self.my_sum(), self.has_usable_as())
        return observation, self.reward, self.done, {}

In [None]:
# Deal a fresh hand with logging on; distribute() returns the
# (state, reward, done, info) tuple for the opening position.
blackJack = BlackJack()
blackJack.distribute(log = True)

In [None]:
# Action 1 = stick: the dealer plays out and the current hand is settled.
blackJack.step(1)

# Dynamic training

In [None]:
import numpy as np
# state(dealer_showing, my_sum, usable_as)
# action 0: hit, 1: stick

def basic_hit_policy(state, limit):
    """Threshold policy: stick (1) once ``state[1]`` reaches ``limit``, else hit (0)."""
    return 1 if state[1] >= limit else 0

def play_policy(policy):
    """Play one episode with ``policy`` and return ``(states, reward)``.

    Each visited state is the index triple
    ``(dealer_card - 1, my_sum - 2, usable_ace)`` observed just before an
    action is chosen; ``policy`` receives that triple and must return
    0 (hit) or 1 (stick). ``states`` is empty when the game is already
    over after the deal. ``reward`` is the final reward of the episode.
    """
    # Generate episode:
    blackJack = BlackJack()
    blackJack.distribute()

    states = []
    # Record every decision state exactly once; the terminal state reached
    # by the last action is not a decision point and is not recorded.
    while not blackJack.done:
        state = (
            blackJack.dealer_cards[0] - 1,
            blackJack.my_sum() - 2,
            blackJack.has_usable_as(),
        )
        states.append(state)
        blackJack.step(policy(state))
    return states, blackJack.reward

# Q axes: 10 possible dealer cards / 20 possible player sums (2-21; a bust ends the episode, so >21 is never indexed) / 2 usable-ace flags / 2 possible actions
def optimal_policy(Q, state):
    """Return the greedy action: the index of the largest value in ``Q`` for ``state``."""
    dealer_idx, sum_idx, ace_idx = state[0], state[1], state[2]
    action_values = Q[dealer_idx, sum_idx, ace_idx]
    return action_values.argmax()

def update_policy_scores(Q, Returns, states, game_state, policy):
    """Monte-Carlo update of ``Q`` from one finished episode.

    Args:
        Q: array indexed as ``Q[dealer, player_sum, usable_ace, action]``;
            updated in place.
        Returns: dict mapping ``(state, action)`` to the list of episode
            rewards observed for that pair; updated in place.
        states: index triples visited during the episode.
        game_state: final reward of the episode, credited to every visited
            state (no discounting).
        policy: the policy that generated the episode. It is re-evaluated
            on each state to recover the action that was taken, so it must
            be deterministic.
    """
    for state in states:
        # Credit the action the episode's policy chose, not whatever the
        # current Q table happens to consider greedy.
        action = policy(state)
        history = Returns.setdefault((state, action), [])
        history.append(game_state)
        Q[state[0], state[1], state[2], action] = sum(history) / len(history)

In [None]:
# Play a first round of the game
# Action-value table indexed as Q[dealer_card - 1, my_sum - 2, usable_ace, action].
Q = np.zeros(shape=(10, 20, 2, 2))
# Observed episode rewards, keyed by (state, action).
Returns = dict()

In [None]:
# Seed the tables with one episode of the "stick on 20 or 21" policy.
# play_policy hands the policy index-shifted states (my_sum - 2), so the
# threshold has to be shifted by the same offset to mean a real sum of 20.
running_policy = lambda state: basic_hit_policy(state, 20 - 2)
states, game_state = play_policy(running_policy)
update_policy_scores(Q, Returns, states, game_state, running_policy)

# Then improve by repeatedly playing greedily with respect to the current Q.
running_policy = lambda state: optimal_policy(Q, state)
for i in range(500):
    states, game_state = play_policy(running_policy)
    update_policy_scores(Q, Returns, states, game_state, running_policy)

In [None]:
def play_blackJack():
    """Play one logged game greedily with respect to the global ``Q`` table.

    Prints the action values of every decision state and returns the
    final reward of the game.
    """
    blackJack = BlackJack()
    blackJack.distribute(log = True)

    while blackJack.done is False:
        state = (
            blackJack.dealer_cards[0] - 1,
            blackJack.my_sum() - 2,
            blackJack.has_usable_as(),
        )
        print(Q[state[0], state[1], state[2]])
        action = optimal_policy(Q, state)
        blackJack.interactive_play(action)
    # Read the reward from the environment: the loop body never runs when
    # the deal itself ends the game, so a loop-local value could be unbound.
    return blackJack.reward

# Evaluate the learned greedy policy over 1000 games and report the net score.
outcome_messages = {1: "*** Win!***", 0: "*** Draw :(***"}
total_score = 0
for i in range(1000):
    game_state = play_blackJack()
    total_score = total_score + game_state
    # Any reward other than 1 or 0 is reported as a loss.
    print(outcome_messages.get(game_state, "*** Lose XD ***"))
print("Total score:", total_score)

# Q-Training

In [None]:
from qlearning import *

def state_function(state):
    """Pass the environment state through unchanged; reject ``None`` with IndexError."""
    if state is not None:
        return state
    raise IndexError
# Wrap a fresh environment in the driver class from the qlearning module.
# NOTE(review): GamePlayer is defined outside this file - presumably it applies
# state_function to each observation before using it as a table key; confirm.
env = BlackJack() 
game = GamePlayer(env, state_function)

In [None]:
# Hyperparameters handed to GamePlayer.train.
# NOTE(review): train() lives in the qlearning module, which is not visible
# here - presumably it returns one reward per episode; confirm.
total_episodes = 500
alpha = 0.3
gamma = 0.9                 # Discounting rate
decay_rate = 5          # Exponential decay rate for exploration prob
epsilon = 0.1                 # Exploration rate
#game.erase_training()
rewards = game.train(total_episodes, alpha, gamma, epsilon, decay_rate, logEvery = 100)
print("Total reward average:", np.mean(rewards))
# Number of entries currently held in the learned table.
print(len(game.qtable))

In [None]:
# Play five episodes with the trained agent and print each episode's total reward.
# NOTE(review): start_game / computer_play_step / end_game belong to
# qlearning.GamePlayer, which is not visible here; the loop relies on
# computer_play_step returning (new_state, reward, done, info) with a
# plain-bool `done`, since the condition tests `done is False`.
for episode in range(5):
    state = game.start_game(True)
    print("****************************************************")
    print("EPISODE ", episode)
    done = False
    tot_reward = 0
    while done is False:
    # for step in range(max_steps):
        # Take the action (index) that has the maximum expected future reward given that state
        new_state, reward, done, info = game.computer_play_step(state)
        #game.play_game_step(0)
        state = new_state
        tot_reward += reward
    print("Reward:", tot_reward)
game.end_game()

In [None]:
# Inspect the learned table (its layout is defined by qlearning.GamePlayer, outside this file).
game.qtable

# Double Q learning

In [None]:
import importlib
import qlearning
# Re-execute the qlearning module so edits made since the first import take effect.
importlib.reload(qlearning)

def state_function(state):
    """Identity mapping from environment state to table key; ``None`` raises IndexError."""
    missing = state is None
    if missing:
        raise IndexError
    return state
# Fresh environment and driver, built from the just-reloaded qlearning module.
# NOTE(review): GamePlayer is defined outside this file; how it uses
# state_function cannot be confirmed from here.
env = BlackJack() 
game = qlearning.GamePlayer(env, state_function)

In [None]:
# Hyperparameters handed to GamePlayer.double_q_train.
# NOTE(review): double_q_train() lives in the qlearning module, which is not
# visible here - presumably it returns one reward per episode; confirm.
total_episodes = 500
alpha = 0.3
gamma = 0.9                 # Discounting rate
decay_rate = 5          # Exponential decay rate for exploration prob
epsilon = 0.1                 # Exploration rate
#game.erase_training()
rewards = game.double_q_train(total_episodes, alpha, gamma, epsilon, decay_rate, logEvery = 100)
print("Total reward average:", np.mean(rewards))
# Number of entries currently held in game.qtable.
print(len(game.qtable))

In [None]:
# Dump the Q2 attribute of the player (presumably the second table of double Q-learning; defined in qlearning - confirm).
print(game.Q2)

In [None]:
# Play five episodes with the double-Q-trained agent and print each episode's total reward.
# NOTE(review): start_game / double_trained_computer_play_step / end_game
# belong to qlearning.GamePlayer, which is not visible here; the loop relies
# on the step call returning (new_state, reward, done, info) with a
# plain-bool `done`, since the condition tests `done is False`.
for episode in range(5):
    state = game.start_game(True)
    print("****************************************************")
    print("EPISODE ", episode)
    done = False
    tot_reward = 0
    while done is False:
    # for step in range(max_steps):
        # Take the action (index) that has the maximum expected future reward given that state
        new_state, reward, done, info = game.double_trained_computer_play_step(state)
        #game.play_game_step(0)
        state = new_state
        tot_reward += reward
    print("Reward:", tot_reward)
game.end_game()