In [3]:
import numpy as np
import random

In [4]:
class Easy21():
    """Environment for an Easy21-style card game.

    Cards have a value drawn uniformly from ``min_value..max_value``. A card
    is "red" (its value is subtracted) with probability ``red_probability``
    and "black" (its value is added) otherwise. A hand is bust when its total
    leaves the range ``hand_lowerbound < total <= hand_upperbound``.

    Args:
        red_probability: Probability that ``draw`` returns a negative value.
            Defaults to 2/5, the value this notebook has always used.
        dealer_lowerbound: The dealer keeps drawing while its total is
            strictly below this number (and not bust). Defaults to 16.

    NOTE(review): the commonly published Easy21 rules use a red probability of
    1/3 and a dealer that sticks on 17 or more. The defaults here preserve the
    notebook's existing behaviour - confirm which variant is intended.
    """

    def __init__(self, red_probability=2/5, dealer_lowerbound=16):
        self.min_value, self.max_value = 1, 10
        self.red_probability = red_probability
        self.dealer_lowerbound = dealer_lowerbound
        self.hand_lowerbound, self.hand_upperbound = 0, 21

    def _draw_value(self):
        """Return an unsigned card value in ``min_value..max_value``."""
        # np.random.randint excludes the upper bound, hence the +1.
        return np.random.randint(self.min_value, self.max_value + 1)

    def _in_play(self, hand_value):
        """Return True while ``hand_value`` has not gone bust."""
        return self.hand_lowerbound < hand_value <= self.hand_upperbound

    def start_game(self):
        """Deal the opening hands.

        Returns:
            ``(player_value, dealer_value)``; both are positive card values.
        """
        player_value = self._draw_value()
        dealer_value = self._draw_value()
        return (player_value, dealer_value)

    def reset_game(self):
        """Deal a fresh pair of opening hands; identical to ``start_game``."""
        return self.start_game()

    def draw(self):
        """Draw one card and return its signed value.

        Returns:
            A negative value for a red card, a positive value for a black one.
        """
        card_value = self._draw_value()
        # random_sample() lies in [0, 1), so a strict '<' makes the red
        # probability exactly ``red_probability`` (including 0 and 1).
        if np.random.random_sample() < self.red_probability:
            return -card_value
        return card_value

    def step(self, player_value, dealer_value, action, terminate_turn):
        """Apply one player action and return the resulting transition.

        Args:
            player_value: Current player total.
            dealer_value: Current dealer total.
            action: ``0`` to hit (draw a card); any other value sticks and
                lets the dealer play out its hand.
            terminate_turn: Ignored; the returned flag is derived from the
                action taken. Kept so existing callers keep working.

        Returns:
            ``(player_value, dealer_value, reward, terminate_turn)``.
            ``reward`` is -1 when the player busts or loses, +1 when the
            dealer busts or the player wins, and 0 for a draw or for a hit
            that does not bust.
        """
        if action == 0:  # hit
            player_value += self.draw()
            if self._in_play(player_value):
                reward = 0
                terminate_turn = False
            else:
                reward = -1
                terminate_turn = True
        else:  # stick
            terminate_turn = True
            # The dealer draws until it reaches its threshold or goes bust
            # at the low end.
            while self.hand_lowerbound < dealer_value < self.dealer_lowerbound:
                dealer_value += self.draw()
            if not self._in_play(dealer_value):
                reward = 1
            elif player_value > dealer_value:
                reward = 1
            elif player_value < dealer_value:
                reward = -1
            else:  # player_value == dealer_value
                reward = 0
        return player_value, dealer_value, reward, terminate_turn

In [5]:
class QLearner():
    """Tabular Q-learning agent for an Easy21-style environment.

    The value table is indexed as ``qtable[player_value, dealer_value, action]``
    with shape ``(22, 11, 2)``; action ``0`` is "hit" and ``1`` is "stick".

    Args:
        easy21: Environment exposing ``start_game()``, ``reset_game()`` and
            ``step(player_value, dealer_value, action, terminate_turn)``.
        total_episodes: Number of episodes ``train`` plays.
        epsilon: Probability of picking a uniformly random action.
        gamma: Discount factor applied to the bootstrapped next-state value.
        alpha: Step size of the Q-value update.
    """

    def __init__(self, easy21, total_episodes, epsilon, gamma, alpha):
        self.easy21 = easy21
        self.qtable = np.zeros((22, 11, 2))
        # Visit counts per (player, dealer, action); num_state sums over actions.
        self.num_state_action = np.zeros((22, 11, 2))
        self.num_state = lambda p, d: np.sum(self.num_state_action[p, d])
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.total_episodes = total_episodes
        self.wins = 0

    def epsilon_greedy(self, player_value, dealer_value):
        """Return a random action with probability ``epsilon``, else the greedy one.

        Ties between the two Q-values resolve to action ``0``.
        """
        if np.random.random() < self.epsilon:
            return random.randint(0, 1)
        return int(np.argmax(self.qtable[player_value, dealer_value]))

    def train(self):
        """Play ``self.total_episodes`` episodes, updating ``qtable`` after every step.

        Each transition applies the one-step Q-learning update
        ``Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))``,
        where the bootstrap term is dropped when the step ends the episode.
        Progress is printed five times per run (every episode for runs
        shorter than five episodes).
        """
        # max(1, ...) avoids a modulo-by-zero for runs shorter than 5 episodes.
        episode_print_split = max(1, self.total_episodes // 5)
        # Use the instance's episode count, not a same-named notebook global.
        for episode in range(self.total_episodes):
            terminate_turn = False
            reward = 0
            player_value, dealer_value = self.easy21.start_game()

            while not terminate_turn:
                action = self.epsilon_greedy(player_value, dealer_value)
                self.num_state_action[player_value, dealer_value, action] += 1
                player_value_new, dealer_value_new, reward, terminate_turn = self.easy21.step(
                    player_value, dealer_value, action, terminate_turn)

                if terminate_turn:
                    # Terminal totals may fall outside the table (a bust hand
                    # or a dealer total above 10), so they are never indexed.
                    target = reward
                else:
                    target = reward + self.gamma * np.max(
                        self.qtable[player_value_new, dealer_value_new])
                self.qtable[player_value, dealer_value, action] += self.alpha * (
                    target - self.qtable[player_value, dealer_value, action])

                player_value, dealer_value = player_value_new, dealer_value_new

            # Return value is unused; the call is kept so the environment is
            # still notified that the episode ended.
            self.easy21.reset_game()

            if reward == 1:
                self.wins += 1
            if (episode + 1) % episode_print_split == 0:
                print('training episode %i | win percentage %.2f'%(episode, self.wins / (episode + 1)))

In [6]:
# Hyperparameters for the tabular Q-learning run.
#   total_episodes: number of games played during training
#   epsilon: probability of choosing a random action
#   gamma: discount factor
#   alpha: step size
# NOTE(review): epsilon = 0.9 makes 90% of actions random - confirm intended.
total_episodes, epsilon, gamma, alpha = 10000, 0.9, 0.95, 0.8

# Build the environment and the agent, then train.
easy21 = Easy21()
q_learner = QLearner(
    easy21,
    total_episodes=total_episodes,
    epsilon=epsilon,
    gamma=gamma,
    alpha=alpha,
)
q_learner.train()

training episode 1999 | win percentage 0.47
training episode 3999 | win percentage 0.47
training episode 5999 | win percentage 0.45
training episode 7999 | win percentage 0.45
training episode 9999 | win percentage 0.45


In [7]:
class ApproxQLearner():
    """Q-learning agent with a linear value function over coarse-coded features.

    A state-action pair is encoded as a binary array of shape
    ``feature_shape`` (dealer intervals x player intervals x actions). An
    entry is 1 when the dealer and player totals both fall inside the
    corresponding (inclusive, overlapping) intervals and the action matches.
    The value estimate is the sum of ``weights`` over the active entries.

    Args:
        easy21: Environment exposing ``start_game()``, ``reset_game()`` and
            ``step(player_value, dealer_value, action, terminate_turn)``.
        total_episodes: Number of episodes ``train`` plays.
        epsilon: Probability of picking a uniformly random action.
        gamma: Discount factor applied to the bootstrapped next-state value.
        alpha: Step size of the weight update. Because intervals overlap, up
            to four features can be active at once, so one update moves the
            estimate by up to ``4 * alpha`` times the TD error; small values
            are advisable.
    """

    def __init__(self, easy21, total_episodes, epsilon, gamma, alpha):
        self.easy21 = easy21
        # Tabular snapshot of the approximated values for visited
        # (player, dealer, action) triples; refreshed after each episode.
        self.qtable = np.zeros((22, 11, 2))
        self.num_state_action = np.zeros((22, 11, 2))
        self.num_state = lambda p, d: np.sum(self.num_state_action[p, d])
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.total_episodes = total_episodes
        self.wins = 0
        self.intervals = {
            'dealer': ((1, 4), (4, 7), (7, 10)),
            'player': ((1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)),
            'action': (0, 1)
        }
        self.feature_shape = tuple(len(self.intervals[key]) for key in ('dealer', 'player', 'action'))
        # Small random initial weights centred on zero.
        self.weights = (np.random.rand(*self.feature_shape) - 0.5) * 0.001

    def weights_function(self, player_value, dealer_value, action):
        """Return the binary feature array for ``(player_value, dealer_value, action)``.

        Despite its name this builds features, not weights. The result is an
        integer array of shape ``feature_shape`` whose only non-zero entries
        lie in the slice belonging to ``action``.
        """
        state_features = np.array(
            [[(d_lo <= dealer_value <= d_hi) and (p_lo <= player_value <= p_hi)
              for (p_lo, p_hi) in self.intervals['player']]
             for (d_lo, d_hi) in self.intervals['dealer']],
            dtype=int)
        features = np.zeros(self.feature_shape, dtype=int)
        # Each action owns its own slice, so both actions get distinct weights.
        features[:, :, action] = state_features
        return features

    def _q_hat(self, player_value, dealer_value, action):
        """Return the linear value estimate: weights summed over active features."""
        return np.sum(self.weights_function(player_value, dealer_value, action) * self.weights)

    def epsilon_greedy(self, player_value, dealer_value):
        """Pick an action epsilon-greedily.

        Returns:
            ``(q_hat, action)`` where ``q_hat`` is the current estimate for
            the chosen action. Greedy ties resolve to action ``0``.
        """
        if np.random.random() < self.epsilon:
            action = random.randint(0, 1)
            return self._q_hat(player_value, dealer_value, action), action
        q_values = [self._q_hat(player_value, dealer_value, a) for a in (0, 1)]
        action = int(np.argmax(q_values))
        return q_values[action], action

    def train(self):
        """Play ``self.total_episodes`` episodes, updating ``weights`` after every step.

        Each transition applies the update
        ``w += alpha * (r + gamma * max_a' q_hat(s', a') - q_hat(s, a)) * x(s, a)``,
        where ``x(s, a)`` is the feature array of the state-action just taken
        and the bootstrap term is dropped when the step ends the episode.
        Progress is printed five times per run (every episode for runs
        shorter than five episodes).
        """
        # max(1, ...) avoids a modulo-by-zero for runs shorter than 5 episodes.
        episode_print_split = max(1, self.total_episodes // 5)
        # Use the instance's episode count, not a same-named notebook global.
        for episode in range(self.total_episodes):
            terminate_turn = False
            reward = 0
            visited = []
            player_value, dealer_value = self.easy21.start_game()

            while not terminate_turn:
                q_hat, action = self.epsilon_greedy(player_value, dealer_value)
                # For a linear model the gradient of q_hat(s, a) with respect
                # to the weights is the feature array of (s, a) itself.
                features = self.weights_function(player_value, dealer_value, action)
                self.num_state_action[player_value, dealer_value, action] += 1
                player_value_new, dealer_value_new, reward, terminate_turn = self.easy21.step(
                    player_value, dealer_value, action, terminate_turn)
                visited.append((player_value, dealer_value, action))

                if terminate_turn:
                    # A terminal state has no future value to bootstrap from.
                    target = reward
                else:
                    target = reward + self.gamma * max(
                        self._q_hat(player_value_new, dealer_value_new, a) for a in (0, 1))
                self.weights += self.alpha * (target - q_hat) * features

                player_value, dealer_value = player_value_new, dealer_value_new

            # Return value is unused; the call is kept so the environment is
            # still notified that the episode ended.
            self.easy21.reset_game()

            # Refresh the tabular view for the pairs seen this episode.
            for (p, d, a) in visited:
                self.qtable[p, d, a] = self._q_hat(p, d, a)

            if reward == 1:
                self.wins += 1
            if (episode + 1) % episode_print_split == 0:
                print('training episode %i | win percentage %.2f'%(episode, self.wins / (episode + 1)))

In [8]:
# Hyperparameters for the function-approximation run.
#   total_episodes: number of games played during training
#   epsilon: probability of choosing a random action
#   gamma: discount factor
#   alpha: step size
# NOTE(review): epsilon = 0.9 makes 90% of actions random - confirm intended.
total_episodes, epsilon, gamma, alpha = 10000, 0.9, 0.95, 0.8

# Build a fresh environment and the agent, then train.
easy21 = Easy21()
approx_q_learner = ApproxQLearner(
    easy21,
    total_episodes=total_episodes,
    epsilon=epsilon,
    gamma=gamma,
    alpha=alpha,
)
approx_q_learner.train()

training episode 1999 | win percentage 0.44
training episode 3999 | win percentage 0.43
training episode 5999 | win percentage 0.43
training episode 7999 | win percentage 0.44
training episode 9999 | win percentage 0.44
