In [28]:
import pyspiel
import numpy as np
import gym, gym.spaces
import math
import random
import numpy as np
from collections import namedtuple, deque
from itertools import count
from abc import ABC, abstractmethod

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
%reload_ext cython
# import line_profiler
# %load_ext line_profiler

from BridgeNetwork import *

%%cython -f --compile-args=-DCYTHON_TRACE=1
# cython: linetrace=True


%%cython

In [29]:
import pyspiel
import numpy as np
import gym, gym.spaces
import math
import random
import numpy as np
from collections import namedtuple, deque
from itertools import count
from abc import ABC, abstractmethod

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from BridgeNetwork import *
GAME = pyspiel.load_game('bridge(use_double_dummy_result=true)')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Adversary(ABC):
    def getAction(observation):
        raise NotImplemented

class RandomAdversary(Adversary):
    def __init__(self, possible_actions):
        self.possible_actions = possible_actions
    def get_action(self, observation):
        return np.random.rand(self.possible_actions)

class PolicyNetAdversary(Adversary):
    def __init__(self, policy_net):
        self.policy_net = policy_net
    
    def get_action(self, observation):
        with torch.no_grad():
            self.policy_net.eval()
            observation = torch.from_numpy(observation).to(device).float().unsqueeze(0)
            return self.policy_net(observation).cpu().numpy()


class WeightedRandomSelectedAdversary(Adversary):
    def __init__(self, adversaries, weights = None) -> None:
        self.adversaries = adversaries
        self.weights = weights

        if self.weights == None:
            self.weights = np.full(len(self.adversaries), 1/len(self.adversaries))  
    
    def get_action(self, observation):
        return random.choices(self.adversaries, weights=self.weights, k=1)[0].get_action(observation)
        

class AdversarialBridgeEnv(gym.Env):
    """Custom Environment that follows gym interface"""

    def __init__(self, adversary, adversary_plays_first = False):
        super(AdversarialBridgeEnv, self).__init__()    # Define action and observation space
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(38,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(571,), dtype=np.float32)
        self.adversary = adversary
        self.adversary_plays_first = adversary_plays_first
        self.reset(adversary_plays_first = adversary_plays_first)

    def reset(self, adversary = None, adversary_plays_first = False):
        self.adversary = adversary if adversary != None else self.adversary
        self.adversary_plays_first = adversary_plays_first
        self.state = self.generate_random_game()
        if self.adversary_plays_first:
            self.adversary_step()

        return np.array(self.state.observation_tensor())

    def step(self, action_dist):
        action = self.pick_action(action_dist)
        self.state.apply_action(action+52)
        
        if self.state.current_phase() == 3:
            return self.calculate_terminal_reward(action)
        
        # opposing team action
        self.adversary_step()

        if self.state.current_phase() == 3:
            return self.calculate_terminal_reward(action)

        return self.calculate_default_reward(action)

    def adversary_step(self):
        self.state.apply_action(
            self.pick_action(
                self.adversary.get_action(np.array(self.state.observation_tensor()))
            ) + 52
        )

    def calculate_default_reward(self, action):
        obs = np.array(self.state.observation_tensor())
        reward = 0
        done = False
        return obs, reward, done, {"action": action}

    def calculate_terminal_reward(self, action):
        obs = np.zeros(571)
        all_possible_contract_rewards = self.state.score_by_contract()
        reward = all_possible_contract_rewards[self.state.contract_index()]
        max_possible_reward = max(all_possible_contract_rewards)
        min_possible_reward = min(all_possible_contract_rewards)
        reward = (reward-min_possible_reward)/(max_possible_reward-min_possible_reward)
        # bidding ends on the winning-bidder's turn, since the bid is won after 3 passes and it becomes the 4th player's turn again
        # if the adversary goes first, the adversary won the bid if the current player is 0 or 2, else 1 or 3
        if self.state.current_player() in ({0,2} if self.adversary_plays_first else {1,3}):
            # Reward was calculated from adversary's perspective
            reward = 1-reward
        done = True
        return obs, reward, done, {"action": action}

    def pick_action(self, action_vector):
        action_vector = self.softmax(action_vector)
        legal_action_mask = np.array(self.state.legal_actions_mask())[52:52+self.action_space.shape[0]]
        masked_action_vector = action_vector*legal_action_mask
        action = np.argmax(masked_action_vector)

        return action


    def softmax(self, x):
        y = np.exp(x - np.max(x))
        f_x = y / np.sum(y)
        return f_x

    def generate_random_game(self): 
        state = GAME.new_initial_state()
        # deal all 52 cards randomly
        for i in np.random.choice(52, size=(52,), replace=False):
            state.apply_action(i)
        return state

class MultipleSimulationRewardEnv(AdversarialBridgeEnv):

    def reset(self, adversary = None, adversary_plays_first = False):
        self.adversary = adversary if adversary != None else self.adversary
        self.adversary_plays_first = adversary_plays_first
        self.games = self.generate_random_games()
        self.state = self.games[0]
        if self.adversary_plays_first:
            self.adversary_step()

        return np.array(self.state.observation_tensor())

    def calculate_terminal_reward(self, action):
        obs = np.zeros(571)
        gen = list((np.array(state.score_by_contract()) for state in self.games))
        all_state_contracts = np.stack(gen)
        all_possible_contract_rewards = np.mean(all_state_contracts, axis=0)
        reward = all_possible_contract_rewards[self.state.contract_index()]
        # max_possible_reward = max(all_possible_contract_rewards)
        # min_possible_reward = min(all_possible_contract_rewards)
        # reward = (reward-min_possible_reward)/(max_possible_reward-min_possible_reward)
        # bidding ends on the winning-bidder's turn, since the bid is won after 3 passes and it becomes the 4th player's turn again
        # if the adversary goes first, the adversary won the bid if the current player is 0 or 2, else 1 or 3
        if self.state.current_player() in ({0,2} if self.adversary_plays_first else {1,3}):
            # Reward was calculated from adversary's perspective
            reward = -reward
        done = True
        return obs, reward, done, {"action": action}
        
    def generate_random_games(self, n_games=5): 
        games = [GAME.new_initial_state() for i in range(n_games)]

        random_deal = np.random.choice(52, size=(52,), replace=False)

        player_hand = random_deal[:26]
        opponent_cards =  random_deal[26:]
        opponent_hands = [np.random.permutation(opponent_cards) for i in range(n_games)]

        # deal all 52 cards randomly
        deal_order = np.empty(52, dtype=int)
        for game, opponent_hand in zip(games,opponent_hands):
            if (self.adversary_plays_first): 
                deal_order[0::2] = opponent_hand
                deal_order[1::2] = player_hand
            else:
                deal_order[0::2] = player_hand
                deal_order[1::2] = opponent_hand
            for i in range(52):
                game.apply_action(deal_order[i])
        return games
    
class RewardForPredictingTeamateHandEnv(MultipleSimulationRewardEnv):

    def step(self, action_dist, teamate_hand_prediction):
        action = self.pick_action(action_dist)
        self.state.apply_action(action+52)
        
        if self.state.current_phase() == 3:
            return self.calculate_terminal_reward(action, teamate_hand_prediction)
        
        # opposing team action
        self.adversary_step()

        if self.state.current_phase() == 3:
            return self.calculate_terminal_reward(action, teamate_hand_prediction)

        return self.calculate_default_reward(action, teamate_hand_prediction)


    def calculate_default_reward(self, action, teamate_hand_prediction):
        obs = np.array(self.state.observation_tensor())

        teamate_hand = self.teamate0_hand if self.state.current_player() == ({0,1} if self.adversary_plays_first else {3,0}) else  self.teamate1_hand

        teamate_hand_prediction_reward = 10*(1 - np.mean(np.square(teamate_hand_prediction - teamate_hand)))

        reward = teamate_hand_prediction_reward
        done = False

        return obs, reward, done, {"action": action, "hand_predict_reward": teamate_hand_prediction_reward, "hand_actual": teamate_hand}

    def calculate_terminal_reward(self, action, teamate_hand_prediction):
        obs = np.zeros(571)
        gen = list((np.array(state.score_by_contract()) for state in self.games))
        all_state_contracts = np.stack(gen)
        all_possible_contract_rewards = np.mean(all_state_contracts, axis=0)
        action_reward = all_possible_contract_rewards[self.state.contract_index()]
        # max_possible_reward = max(all_possible_contract_rewards)
        # min_possible_reward = min(all_possible_contract_rewards)
        # reward = (reward-min_possible_reward)/(max_possible_reward-min_possible_reward)
        # bidding ends on the winning-bidder's turn, since the bid is won after 3 passes and it becomes the 4th player's turn again
        # if the adversary goes first, the adversary won the bid if the current player is 0 or 2, else 1 or 3
        if self.state.current_player() in ({0,2} if self.adversary_plays_first else {1,3}):
            # Reward was calculated from adversary's perspective
            action_reward = -action_reward

        teamate_hand = self.teamate0_hand if self.state.current_player() in ({0,1} if self.adversary_plays_first else {3,0}) else  self.teamate1_hand

        teamate_hand_prediction_reward = 10*(1 - np.mean(np.square(teamate_hand_prediction - teamate_hand)))

        reward = action_reward + teamate_hand_prediction_reward

        done = True
        return obs, reward, done, {"action": action, "hand_predict_reward": teamate_hand_prediction_reward, "hand_actual": teamate_hand}

    def generate_random_games(self, n_games=5): 
        games = [GAME.new_initial_state() for i in range(n_games)]

        random_deal = np.random.choice(52, size=(52,), replace=False)

        player_hand = random_deal[:26]
        opponent_cards =  random_deal[26:]
        opponent_hands = [np.random.permutation(opponent_cards) for i in range(n_games)]

        self.teamate0_hand = np.zeros(52)
        self.teamate0_hand[player_hand[0::2]] = 1 # set indices of player 0's cards to 1
        self.teamate1_hand = np.zeros(52)
        self.teamate1_hand[player_hand[1::2]] = 1

        # deal all 52 cards randomly
        deal_order = np.empty(52, dtype=int)
        for game, opponent_hand in zip(games,opponent_hands):
            if (self.adversary_plays_first): 
                deal_order[0::2] = opponent_hand
                deal_order[1::2] = player_hand
            else:
                deal_order[0::2] = player_hand
                deal_order[1::2] = opponent_hand
            for i in range(52):
                game.apply_action(deal_order[i])
        return games

In [30]:
Transition = namedtuple('Transition',
                        ('state', 'action','hand_prediction', 'hand_actual', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [31]:
def select_action(state, env):
    global steps_done
    global ucb_action_picked_counter

    with torch.no_grad():
        ucb_term = UCB_CONFIDENCE*(np.sqrt(np.log(steps_done)/ucb_action_picked_counter))
        steps_done += 1
        policy_net.eval()

        action_vector, hand_prediction = policy_net(state)
        return action_vector.cpu()+ucb_term, hand_prediction.cpu()

In [32]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
                                                
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    prediction_batch = torch.cat(batch.hand_prediction)
    actual_batch = torch.cat(batch.hand_actual)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    action_vector, prediction = policy_net(state_batch)
    state_action_values = action_vector.gather(1, action_batch.type(torch.int64).to(device).unsqueeze(1)).squeeze()

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    target_net.train()
    target_action_vector, target_prediction = target_net(non_final_next_states)
    next_state_values[non_final_mask] = target_action_vector.max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    print(prediction_batch)
    print()
    print(actual_batch)
    loss = criterion(state_action_values, expected_state_action_values) + criterion(prediction_batch, actual_batch)

    print(loss)
    # Optimize the model    
    optimizer.zero_grad()
    loss.backward()
    # for param in policy_net.parameters():
    #     param.grad.data.clamp_(-1, 1)
    optimizer.step()

    return loss

In [33]:
BATCH_SIZE = 32
GAMMA = 1

TARGET_UPDATE = 20

ADVERSARY_RANDOMNESS_DECAY = 0.90
ADVERSARY_RANDOMNESS = 1.0
ADVERSARY_UPDATE = 100

UCB_CONFIDENCE = 100

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
envs = []

# Get number of actions from gym action space

policy_net = BridgeSupervisedWithHandPrediction().to(device)
target_net = BridgeSupervisedWithHandPrediction().to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)


steps_done = 1
ucb_action_picked_counter=np.ones((38))

trailing_avg_reward = deque()
trailing_avg_size = 100

In [34]:
def play_sample_game():
    env = envs[-1]
    # Initialize the environment and state
    obs = env.reset(adversary_plays_first=True)
    obs = torch.from_numpy(obs).to(device).float().unsqueeze(0)
    for t in count():
        # Select and perform an action
        action, hand_prediction = select_action(obs, env)
        new_obs, reward, done, metadata = env.step(np.array(action.cpu()), np.array(hand_prediction.cpu()))
        new_obs = torch.from_numpy(new_obs).to(device).float().unsqueeze(0)
        reward = torch.tensor([reward], device=device)
        
        # Move to the next state
        obs = new_obs
        if done:
            print(env.state)
            break

In [35]:
from IPython.display import clear_output
def run_training(num_episodes = 10000):
    global envs
    global ucb_action_picked_counter

    for i_episode in range(num_episodes):
        if i_episode % 100 == 0:
            clear_output(wait=True)
            
            adversary_randomness = ADVERSARY_RANDOMNESS * ADVERSARY_RANDOMNESS_DECAY ** (i_episode // 100)
            envs.append(
                RewardForPredictingTeamateHandEnv(
                    WeightedRandomSelectedAdversary((
                        RandomAdversary(38), 
                        PolicyNetAdversary(target_net)), [adversary_randomness, 1-adversary_randomness])))
            envs = envs[-10:]

            play_sample_game()

        env = random.choice(envs)
        # Initialize the environment and state
        obs = env.reset(adversary_plays_first=random.random() > 0.5)
        obs = torch.from_numpy(obs).to(device).float().unsqueeze(0)
        for t in count():
            # Select and perform an action
            action, hand_prediction = select_action(obs, env)
            new_obs, reward, done, metadata = env.step(np.array(action.cpu()), np.array(hand_prediction.cpu()))
            new_obs = torch.from_numpy(new_obs).to(device).float().unsqueeze(0)
            reward = torch.tensor([reward], device=device)

            # Store the transition in memory
            memory.push(obs, torch.Tensor(np.array([metadata["action"]], dtype=np.int64)), hand_prediction, torch.Tensor(metadata["hand_actual"].reshape((1,-1))), new_obs, reward)

            # update UCB action counter
            ucb_action_picked_counter[metadata["action"]] += 1

            # Move to the next state
            obs = new_obs

            # Perform one step of the optimization (on the policy network)
            loss = optimize_model()
            if done:
                
                trailing_avg_reward.append(reward[0].cpu().numpy())
                if len(trailing_avg_reward) > trailing_avg_size:
                    trailing_avg_reward.popleft()
                
                print(f"episode #{i_episode}, episode reward: {round(np.sum(np.array(reward[0].cpu())),2)}, avg_reward: {round(np.mean(trailing_avg_reward),2)}, episode length: {t+1}, loss: {loss}, predict_reward: {round(metadata['hand_predict_reward'],2)}")
                # print(env.state)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print('Complete')



In [36]:

run_training()
# %lprun -f MultipleSimulationRewardEnv.generate_random_games run_training(10)

Vul: None
        S AT2
        H KQT7
        D 64
        C J654
S 5             S J93
H J92           H A85
D AJ9852        D KQ7
C 873           C AKQ2
        S KQ8764
        H 643
        D T3
        C T9

West  North East  South
      1C    3N    6C    
6H    6N    7H    7N    
Pass  Pass  Dbl   RDbl  
Pass  Pass  Pass  

Declarer tricks: 3
Score: N/S -5200 E/W 5200
episode #0, episode reward: -2192.49, avg_reward: -2192.49, episode length: 7, loss: None, predict_reward: 7.51
episode #1, episode reward: -3992.46, avg_reward: -3092.48, episode length: 6, loss: None, predict_reward: 7.54
episode #2, episode reward: 367.52, avg_reward: -1939.14, episode length: 5, loss: None, predict_reward: 7.52
episode #3, episode reward: 1047.58, avg_reward: -1192.46, episode length: 6, loss: None, predict_reward: 7.58
episode #4, episode reward: 437.54, avg_reward: -866.46, episode length: 7, loss: None, predict_reward: 7.54
tensor([[ 0.0082,  0.0330, -0.0044,  ..., -0.0485, -0.1048,  0.0482]

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [None]:
import cProfile
with  cProfile.Profile() as pr:
    run_training(10)
pr.print_stats(sort='cumtime')


In [None]:
def play_sample_game():
    env = envs[-1]
    # Initialize the environment and state
    obs = env.reset(adversary_plays_first=False)
    obs = torch.from_numpy(obs).to(device).float().unsqueeze(0)
    for t in count():
        # Select and perform an action
        action = select_action(obs, env)
        new_obs, reward, done, metadata = env.step(np.array(action.cpu()))
        new_obs = torch.from_numpy(new_obs).to(device).float().unsqueeze(0)
        reward = torch.tensor([reward], device=device)
        
        # Move to the next state
        obs = new_obs
        if done:
            print(env.state)
            break

play_sample_game()

In [None]:
ucb_action_picked_counter

array([1559.,  525.,  416.,   88.,   88.,   88.,   88.,   88.,   88.,
         88.,   87.,   87.,   87.,   87.,   87.,   87.,   88.,   87.,
         87.,   88.,   87.,   88.,   88.,   95.,   95.,   95.,  105.,
        119.,  126.,  138.,  139.,  149.,  189.,  199.,  268.,  323.,
        401.,  533.])